In [29]:
import sys
import psycopg2 as pg2

import pandas as pd
import numpy as np
from pprint import pprint
sys.path.insert(0, "..")
import pg_tools
from dataclasses_att import *

import sqlalchemy
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import extract

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
engine = create_engine(pg_tools.pg_connection_string, echo=False)
'''
sessionmaker() создает функцию для создания сессий ORM.
autocommit=False отключает автоматическое подтверждение транзакций.
autoflush=False отключает автоматическое обновление состояния сессии после каждого запроса.
bind=engine связывает сессию с созданным движком.
'''
SessionMaker = sessionmaker(autocommit=False, autoflush=False, bind=engine)

In [3]:
sql_file = lambda x: f'{x}.sql'
csv_file = lambda x: f'result/{x}.csv'

In [4]:
with SessionMaker() as session:
    film_info = session.query(Film.film_id, Film.title, Film.release_year, Film.rental_duration,
                              Film.rental_rate, Film.length, Film.replacement_cost,
                              Film.rating, Film.special_features, Language.name)
    film_info = film_info.select_from(Film)
    film_info = film_info.join(Language, Film.language_id == Language.language_id)
    df_film_info = pd.read_sql(film_info.statement, engine)
df_film_info

Unnamed: 0,film_id,title,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,name
0,1,ACADEMY DINOSAUR,2006,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",English
1,2,ACE GOLDFINGER,2006,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",English
2,3,ADAPTATION HOLES,2006,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",English
3,4,AFFAIR PREJUDICE,2006,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",English
4,5,AFRICAN EGG,2006,6,2.99,130,22.99,G,Deleted Scenes,English
...,...,...,...,...,...,...,...,...,...,...
995,996,YOUNG LANGUAGE,2006,6,0.99,183,9.99,G,"Trailers,Behind the Scenes",English
996,997,YOUTH KICK,2006,4,0.99,179,14.99,NC-17,"Trailers,Behind the Scenes",English
997,998,ZHIVAGO CORE,2006,6,0.99,105,10.99,NC-17,Deleted Scenes,English
998,999,ZOOLANDER FICTION,2006,5,2.99,101,28.99,R,"Trailers,Deleted Scenes",English


In [59]:
with SessionMaker() as session:
    category_count = session.query(FilmCategory.film_id, sqlalchemy.func.count(FilmCategory.category_id).label("category_count"))
    category_count = category_count.select_from(FilmCategory)
    category_count = category_count.group_by(FilmCategory.film_id)
    category_count = category_count.order_by(sqlalchemy.desc("category_count"))
    df_category_count = pd.read_sql(category_count.statement, engine)
    
    actor_count = session.query(FilmActor.film_id, sqlalchemy.func.count(FilmActor.actor_id).label("actor_count"))
    actor_count = actor_count.select_from(FilmActor)
    actor_count = actor_count.group_by(FilmActor.film_id)
    actor_count = actor_count.order_by(sqlalchemy.desc("actor_count"))
    df_actor_count = pd.read_sql(actor_count.statement, engine)    
    
    film_sales = session.query(Film.film_id, 
                               sqlalchemy.func.sum(Payment.amount).label("amount"),
                               sqlalchemy.func.count(Payment.amount).label("count_payment"))
    film_sales = film_sales.select_from(Film)
    film_sales = film_sales.outerjoin(Inventory, Film.film_id == Inventory.film_id)
    film_sales = film_sales.outerjoin(Rental, Inventory.inventory_id == Rental.inventory_id)
    film_sales = film_sales.outerjoin(Payment, Rental.rental_id == Payment.rental_id)
    film_sales = film_sales.group_by(Film.film_id)
    df_film_sales = pd.read_sql(film_sales.statement, engine)
    
    film_info = session.query(Film.film_id, Film.title, Film.release_year, Film.rental_duration,
                              Film.rental_rate, Film.length, Film.replacement_cost,
                              Film.rating, Film.special_features, Language.name.label("language"))
    film_info = film_info.select_from(Film)
    film_info = film_info.join(Language, Film.language_id == Language.language_id)
    df_film_info = pd.read_sql(film_info.statement, engine)    

In [82]:
df = df_film_info.merge(df_film_sales, on="film_id", how='left')
df = df.merge(df_category_count, on='film_id', how='left')
df = df.merge(df_actor_count, on='film_id', how='left')
df.to_csv(csv_file("task3-df"))
df.sample(10)

Unnamed: 0,film_id,title,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,language,amount,count_payment,category_count,actor_count
109,182,CONTROL ANTHEM,2006,7,4.99,185,9.99,G,Commentaries,English,36.93,7,1,3.0
909,909,TREASURE COMMAND,2006,3,0.99,102,28.99,PG-13,"Trailers,Commentaries,Deleted Scenes,Behind th...",English,,0,1,7.0
985,986,WONKA SEA,2006,6,2.99,85,24.99,NC-17,"Trailers,Commentaries",English,66.82,18,1,2.0
569,570,MERMAID INSECTS,2006,5,4.99,104,20.99,NC-17,"Trailers,Behind the Scenes",English,99.84,16,1,6.0
406,407,HAWK CHILL,2006,5,0.99,47,12.99,PG-13,Behind the Scenes,English,15.93,7,1,7.0
227,228,DETECTIVE VISION,2006,4,0.99,143,16.99,PG-13,"Trailers,Commentaries,Behind the Scenes",English,68.73,27,1,3.0
340,341,FROST HEAD,2006,5,0.99,82,13.99,PG,"Trailers,Deleted Scenes",English,76.7,30,1,7.0
849,850,STORY SIDE,2006,7,0.99,163,27.99,R,"Trailers,Behind the Scenes",English,39.72,28,1,5.0
711,712,RAIDERS ANTITRUST,2006,4,0.99,82,11.99,PG-13,Deleted Scenes,English,,0,1,6.0
661,662,PATHS CONTROL,2006,3,4.99,118,9.99,PG,"Trailers,Behind the Scenes",English,81.9,10,1,5.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   film_id           1000 non-null   int64  
 1   title             1000 non-null   object 
 2   release_year      1000 non-null   object 
 3   rental_duration   1000 non-null   int64  
 4   rental_rate       1000 non-null   float64
 5   length            1000 non-null   int64  
 6   replacement_cost  1000 non-null   float64
 7   rating            1000 non-null   object 
 8   special_features  1000 non-null   object 
 9   name              1000 non-null   object 
 10  amount            958 non-null    float64
 11  count_payment     1000 non-null   int64  
 12  category_count    1000 non-null   int64  
 13  actor_count       997 non-null    float64
dtypes: float64(4), int64(5), object(5)
memory usage: 109.5+ KB


In [62]:
numeric_columns = ['rental_duration', 'rental_rate', 'length', 'replacement_cost', 'amount', 'count_payment', 'category_count', 'actor_count']
df_numeric = df[numeric_columns].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])
# вычислить дисперсию
columns_var = []
for column in numeric_columns:
    columns_var.append(df[column].var())
df_numeric.loc['var'] = columns_var

# доля пропусков
columns_nan = []
total_count = len(df)
for column in numeric_columns:
    columns_nan.append((len(df[df[column].isna()]) / total_count) * 100)
df_numeric.loc['nan%'] = columns_nan
df_numeric.to_csv(csv_file("task3-numeric"))
df_numeric

Unnamed: 0,rental_duration,rental_rate,length,replacement_cost,amount,count_payment,category_count,actor_count
count,1000.0,1000.0,1000.0,1000.0,958.0,1000.0,1000.0,997.0
mean,4.985,2.98,115.272,19.984,70.361754,16.044,1.0,5.478435
std,1.411654,1.646393,40.426332,6.050833,42.289777,7.348065,0.0,2.321712
min,3.0,0.99,46.0,9.99,5.94,0.0,1.0,1.0
10%,3.0,0.99,60.0,11.99,22.87,7.0,1.0,3.0
25%,4.0,0.99,80.0,14.99,37.7425,11.0,1.0,4.0
50%,5.0,2.99,114.0,19.99,62.32,16.0,1.0,5.0
75%,6.0,4.99,149.25,24.99,95.755,21.0,1.0,7.0
90%,7.0,4.99,173.0,27.99,130.803,26.0,1.0,8.0
max,7.0,4.99,185.0,29.99,231.73,34.0,1.0,15.0


In [80]:
cat_columns = ['rating', 'special_features', 'language', 'release_year']

columns_nan = []
columns_mode = []
columns_unique = []
total_count = len(df)
for column in cat_columns:
    columns_nan.append((len(df[df[column].isna()]) / total_count) * 100)
    columns_mode.append(df[column].mode()[0])
    columns_unique.append(df[column].nunique())
df_cat = pd.DataFrame(data = [columns_nan, columns_mode, columns_unique], 
                      index = ['%nan', 'mode', 'nunique'],
                      columns = cat_columns)
df_cat.to_csv(csv_file("task3-cat"))
df_cat

Unnamed: 0,rating,special_features,language,release_year
%nan,0.0,0.0,0.0,0.0
mode,PG-13,"Trailers,Commentaries,Behind the Scenes",English,2006.0
nunique,5,15,1,1.0
