In [4]:
import sys
import psycopg2 as pg2

import pandas as pd
import numpy as np
from pprint import pprint
sys.path.insert(0, "..")
import pg_tools
from dataclasses_att import *

import sqlalchemy
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import extract

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import dotenv_values


In [5]:
# Параметры подключения к БД
config = {
    **dotenv_values("../.env")
}
#pg_connection_string = f'postgresql+psycopg2://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_database}'
pg_connection_string = f'postgresql://{config["PG_USER"]}:{config["PG_PASSWORD"]}@{config["PG_HOST"]}:{config["PG_PORT"]}/{config["PG_DATABASE"]}'

In [6]:
engine = create_engine(pg_connection_string, echo=False)
'''
sessionmaker() создает функцию для создания сессий ORM.
autocommit=False отключает автоматическое подтверждение транзакций.
autoflush=False отключает автоматическое обновление состояния сессии после каждого запроса.
bind=engine связывает сессию с созданным движком.
'''
SessionMaker = sessionmaker(autocommit=False, autoflush=False, bind=engine)

In [7]:
sql_file = lambda x: f'{x}.sql'
csv_file = lambda x: f'result/{x}.csv'

# Подготовить dataframe с информацией о фильмах

In [8]:
with SessionMaker() as session:
    # собрать базовую инфу по фильмам
    film_info = session.query(Film.film_id, Film.title, Film.release_year, Film.rental_duration,
                              Film.rental_rate, Film.length, Film.replacement_cost,
                              Film.rating, Film.special_features, Language.name.label("language"))
    film_info = film_info.select_from(Film)
    film_info = film_info.join(Language, Film.language_id == Language.language_id)
    df_film_info = pd.read_sql(film_info.statement, engine)    
    
    # посчитать количество категорий по каждому фильму
    category_count = session.query(FilmCategory.film_id, sqlalchemy.func.count(FilmCategory.category_id).label("category_count"))
    category_count = category_count.select_from(FilmCategory)
    category_count = category_count.group_by(FilmCategory.film_id)
    category_count = category_count.order_by(sqlalchemy.desc("category_count"))
    df_category_count = pd.read_sql(category_count.statement, engine)
    
    # посчитать количество актёров по каждому фильму
    actor_count = session.query(FilmActor.film_id, sqlalchemy.func.count(FilmActor.actor_id).label("actor_count"))
    actor_count = actor_count.select_from(FilmActor)
    actor_count = actor_count.group_by(FilmActor.film_id)
    actor_count = actor_count.order_by(sqlalchemy.desc("actor_count"))
    df_actor_count = pd.read_sql(actor_count.statement, engine)    
    
    # посчитать показатели продаж по каждому фильму
    film_sales = session.query(Film.film_id, 
                               sqlalchemy.func.sum(Payment.amount).label("amount"),
                               sqlalchemy.func.count(Payment.amount).label("count_payment"))
    film_sales = film_sales.select_from(Film)
    film_sales = film_sales.outerjoin(Inventory, Film.film_id == Inventory.film_id)
    film_sales = film_sales.outerjoin(Rental, Inventory.inventory_id == Rental.inventory_id)
    film_sales = film_sales.outerjoin(Payment, Rental.rental_id == Payment.rental_id)
    film_sales = film_sales.group_by(Film.film_id)
    df_film_sales = pd.read_sql(film_sales.statement, engine)


In [9]:
# объеденить всю информацию по фильмам в один датафрейм
df = df_film_info.merge(df_film_sales, on="film_id", how='left')
df = df.merge(df_category_count, on='film_id', how='left')
df = df.merge(df_actor_count, on='film_id', how='left')
df.to_csv(csv_file("task3-df"))
df.sample(10)

Unnamed: 0,film_id,title,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features,language,amount,count_payment,category_count,actor_count
668,669,PEARL DESTINY,2006,3,2.99,74,10.99,NC-17,Trailers,English,,0,1,4.0
633,634,ODDS BOOGIE,2006,6,0.99,48,14.99,NC-17,"Trailers,Commentaries,Behind the Scenes",English,23.85,15,1,5.0
899,899,TOWERS HURRICANE,2006,7,0.99,144,14.99,NC-17,"Commentaries,Behind the Scenes",English,12.89,11,1,4.0
291,291,EVOLUTION ALTER,2006,5,0.99,174,10.99,PG-13,Behind the Scenes,English,26.88,12,1,6.0
695,696,PRIDE ALAMO,2006,6,0.99,114,20.99,NC-17,Deleted Scenes,English,18.84,16,1,4.0
987,988,WORKER TARZAN,2006,7,2.99,139,26.99,R,"Trailers,Commentaries,Behind the Scenes",English,50.85,15,1,9.0
248,249,DRACULA CRYSTAL,2006,7,0.99,176,26.99,G,Commentaries,English,24.79,21,1,13.0
412,413,HEDWIG ALTER,2006,7,2.99,169,16.99,NC-17,"Trailers,Commentaries,Behind the Scenes",English,37.88,12,1,4.0
575,576,MIGHTY LUCK,2006,7,2.99,122,13.99,PG,Behind the Scenes,English,38.88,12,1,8.0
874,875,TALENTED HOMICIDE,2006,6,0.99,173,9.99,PG,"Commentaries,Deleted Scenes,Behind the Scenes",English,39.71,29,1,5.0


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   film_id           1000 non-null   int64  
 1   title             1000 non-null   object 
 2   release_year      1000 non-null   object 
 3   rental_duration   1000 non-null   int64  
 4   rental_rate       1000 non-null   float64
 5   length            1000 non-null   int64  
 6   replacement_cost  1000 non-null   float64
 7   rating            1000 non-null   object 
 8   special_features  1000 non-null   object 
 9   language          1000 non-null   object 
 10  amount            958 non-null    float64
 11  count_payment     1000 non-null   int64  
 12  category_count    1000 non-null   int64  
 13  actor_count       997 non-null    float64
dtypes: float64(4), int64(5), object(5)
memory usage: 109.5+ KB


# Анализ числовых столбцов

In [11]:
numeric_columns = ['rental_duration', 'rental_rate', 'length', 'replacement_cost', 'amount', 'count_payment', 'category_count', 'actor_count']
df_numeric = df[numeric_columns].describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])
# вычислить дисперсию
columns_var = []
for column in numeric_columns:
    columns_var.append(df[column].var())
df_numeric.loc['var'] = columns_var

# доля пропусков
columns_nan = []
total_count = len(df)
for column in numeric_columns:
    columns_nan.append((len(df[df[column].isna()]) / total_count) * 100)
df_numeric.loc['nan%'] = columns_nan
df_numeric.to_csv(csv_file("task3-numeric"))
df_numeric

Unnamed: 0,rental_duration,rental_rate,length,replacement_cost,amount,count_payment,category_count,actor_count
count,1000.0,1000.0,1000.0,1000.0,958.0,1000.0,1000.0,997.0
mean,4.985,2.98,115.272,19.984,70.361754,16.044,1.0,5.478435
std,1.411654,1.646393,40.426332,6.050833,42.289777,7.348065,0.0,2.321712
min,3.0,0.99,46.0,9.99,5.94,0.0,1.0,1.0
10%,3.0,0.99,60.0,11.99,22.87,7.0,1.0,3.0
25%,4.0,0.99,80.0,14.99,37.7425,11.0,1.0,4.0
50%,5.0,2.99,114.0,19.99,62.32,16.0,1.0,5.0
75%,6.0,4.99,149.25,24.99,95.755,21.0,1.0,7.0
90%,7.0,4.99,173.0,27.99,130.803,26.0,1.0,8.0
max,7.0,4.99,185.0,29.99,231.73,34.0,1.0,15.0


# Анализ категориальных данных

In [12]:
cat_columns = ['rating', 'special_features', 'language', 'release_year']

columns_nan = []
columns_mode = []
columns_unique = []
total_count = len(df)
for column in cat_columns:
    columns_nan.append((len(df[df[column].isna()]) / total_count) * 100)
    columns_mode.append(df[column].mode()[0])
    columns_unique.append(df[column].nunique())
df_cat = pd.DataFrame(data = [columns_nan, columns_mode, columns_unique], 
                      index = ['%nan', 'mode', 'nunique'],
                      columns = cat_columns)
df_cat.to_csv(csv_file("task3-cat"))
df_cat

Unnamed: 0,rating,special_features,language,release_year
%nan,0.0,0.0,0.0,0.0
mode,PG-13,"Trailers,Commentaries,Behind the Scenes",English,2006.0
nunique,5,15,1,1.0
