# DAY 78
Introduction to scikit-learn and seaborn

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression as LR

# Display floats with thousands separators and two decimal places.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Register pandas' formatters/converters with matplotlib.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Load the raw budget/revenue table from the working directory.
data = pd.read_csv('cost_revenue_dirty.csv')

In [None]:
# Strip currency formatting from the money columns and convert them to numbers.
drop_char = [',', '$']  # characters to remove
col = ['USD_Production_Budget',
       'USD_Worldwide_Gross',
       'USD_Domestic_Gross']
for c in col:
    for char in drop_char:
        # regex=False makes the match literal. Under a regex interpretation
        # (the default before pandas 2.0) '$' is an end-of-string anchor, so
        # the dollar sign would be left in place and pd.to_numeric would fail.
        data[c] = data[c].astype(str).str.replace(char, "", regex=False)
    # Cast the cleaned strings to a numeric dtype.
    data[c] = pd.to_numeric(data[c])


In [None]:
# Convert the release dates from object dtype to datetime values,
# then print the frame's column/dtype summary to confirm the change.
data['Release_Date'] = pd.to_datetime(data['Release_Date'])
data.info()

In [None]:
data.describe() #useful for seeing the table's central-tendency values

In [None]:
# Count the films that report zero revenue, first worldwide and then domestically.
# Tip: chain .sort_values('USD_Production_Budget', ascending=False) on either
# frame to list the most expensive of these films first.
worldwide_is_zero = data['USD_Worldwide_Gross'] == 0
zero_world = data.loc[worldwide_is_zero]
print(f'films that grossed $0 worldwide {len(zero_world)}')

domestic_is_zero = data['USD_Domestic_Gross'] == 0
zero_domestic = data.loc[domestic_is_zero]
print(f'films that grossed $0 domestically {len(zero_domestic)}')

In [None]:
# Combine two conditions: no domestic gross, but a non-zero worldwide gross.
no_domestic_gross = data['USD_Domestic_Gross'] == 0
some_worldwide_gross = data['USD_Worldwide_Gross'] != 0
int_releases = data[no_domestic_gross & some_worldwide_gross]
print(f'international releases: {len(int_releases)}')
int_releases.tail()

In [None]:
# Remove films that had not yet been released when the data was collected.
coleta_date = pd.Timestamp('2018-5-1')  # data collection date
not_yet_released = data['Release_Date'] >= coleta_date
future_releases = data.loc[not_yet_released]
print(f'unreleased films: {len(future_releases)}')

# data_clean is a new frame; `data` itself is left untouched.
data_clean = data.drop(index=future_releases.index)

In [None]:
# Films that lost money: production budget greater than worldwide revenue.
budget_exceeds_gross = data_clean['USD_Production_Budget'] > data_clean['USD_Worldwide_Gross']
money_lost = data_clean[budget_exceeds_gross]
pct_lost = round(len(money_lost) / len(data_clean) * 100, 2)
print(f'{pct_lost}% dos filmes perderam dinheiro')

## Seaborn


In [None]:
# Plain scatter plot: production budget (x) against worldwide revenue (y).
budget_limits = (0, 450_000_000)
revenue_limits = (0, 3_000_000_000)

plt.figure(figsize=(8, 4), dpi=200)
axis = sns.scatterplot(x='USD_Production_Budget',
                       y='USD_Worldwide_Gross',
                       data=data_clean)

# Fix the visible range and label both axes.
axis.set(xlim=budget_limits,
         ylim=revenue_limits,
         xlabel='Budget in $100 millions',
         ylabel='Revenue in $ billions')

plt.show()

In [None]:
# Bubble-chart variant of the scatter plot: worldwide gross drives both the
# colour and the size of each dot.
gross_col = 'USD_Worldwide_Gross'

plt.figure(figsize=(8, 4), dpi=200)
axis = sns.scatterplot(data=data_clean,
                       x='USD_Production_Budget',
                       y=gross_col,
                       hue=gross_col,
                       size=gross_col)

axis_settings = {
    'ylim': (0, 3000000000),
    'xlim': (0, 450000000),
    'ylabel': 'Revenue in $ billions',
    'xlabel': 'Budget in $100 millions',
}
axis.set(**axis_settings)

plt.show()

In [None]:
# Same bubble chart, drawn with seaborn's 'darkgrid' axes style.
plt.figure(figsize=(8, 4), dpi=200)

gross_col = 'USD_Worldwide_Gross'
bubble_kwargs = dict(
    x='USD_Production_Budget',
    y=gross_col,
    hue=gross_col,   # colour encodes gross revenue
    size=gross_col,  # dot size encodes gross revenue
)

# The style applies only to axes created inside this context.
with sns.axes_style('darkgrid'):
    axis = sns.scatterplot(data=data_clean, **bubble_kwargs)
    axis.set(xlim=(0, 450000000),
             ylim=(0, 3000000000),
             xlabel='Budget in $100 millions',
             ylabel='Revenue in $ billions')

In [None]:
# Budgets over time: release date on the x-axis, with colour and dot size
# both driven by worldwide gross.
plt.figure(figsize=(8, 4), dpi=200)

release_dates = data_clean['Release_Date']
date_range = (release_dates.min(), release_dates.max())

with sns.axes_style("darkgrid"):
    axis = sns.scatterplot(x='Release_Date',
                           y='USD_Production_Budget',
                           hue='USD_Worldwide_Gross',
                           size='USD_Worldwide_Gross',
                           data=data_clean)

    # Using dates as the x-limits makes this a time-series chart.
    axis.set(xlim=date_range,
             ylim=(0, 450000000),
             xlabel='Year',
             ylabel='Budget in $100 millions')

In [None]:
# Add a Decade column to the dataframe so films can be grouped by decade.
dt_index = pd.DatetimeIndex(data_clean['Release_Date'])
years = dt_index.year
decades = (years // 10) * 10  # floor each year to its decade, e.g. 1987 -> 1980
data_clean['Decade'] = decades

# Split into two frames: Decade <= 1960 (the 1960s and earlier) and Decade > 1960.
decade_col = data_clean['Decade']
old_films = data_clean.loc[decade_col <= 1960]
new_films = data_clean.loc[decade_col > 1960]

data_clean.sample()  # spot-check one random row to confirm the new column

In [None]:
# Linear regression of budget vs. revenue for the old films, drawn with sns.regplot.
point_style = {'alpha': 0.4}   # semi-transparent points so overlaps stay visible
fit_style = {'color': 'red'}   # regression line colour

plt.figure(figsize=(8, 4), dpi=200)
with sns.axes_style("whitegrid"):
    sns.regplot(x='USD_Production_Budget',
                y='USD_Worldwide_Gross',
                data=old_films,
                scatter_kws=point_style,
                line_kws=fit_style)

In [None]:
# Linear regression of budget vs. revenue for the new films, drawn with sns.regplot.
plt.figure(figsize=(8, 4), dpi=200)

regplot_kwargs = dict(
    x='USD_Production_Budget',
    y='USD_Worldwide_Gross',
    color='#2f4b7c',
    scatter_kws={'alpha': 0.3},
    line_kws={'color': 'orange'},
)

with sns.axes_style('darkgrid'):
    ax = sns.regplot(data=new_films, **regplot_kwargs)

    # Same axis ranges and labels as the scatter plots above.
    ax.set(xlim=(0, 450000000),
           ylim=(0, 3000000000),
           xlabel='Budget in $100 millions',
           ylabel='Revenue in $ billions')

## Scikit-learn

In [None]:
# Linear regression with scikit-learn: worldwide revenue as a function of budget.
regression = LR()

# Capital X = features. A one-column DataFrame holding the budgets.
X = new_films[['USD_Production_Budget']]

# Lowercase y = target. Also kept as a one-column DataFrame, so intercept_ and
# coef_ come back as arrays that are indexed with [0] / [0, 0] later on.
y = new_films[['USD_Worldwide_Gross']]

regression.fit(X, y)

# Theta zero = intercept: how much a movie would make if the budget were 0.
print(regression.intercept_)
# Theta one = slope: how much more a movie would make per extra $1 of budget.
print(regression.coef_)

# R-squared: the share of the variance in movie revenue explained by the model.
r_squared = regression.score(X, y)
print(f'{round(r_squared * 100)}%')

In [None]:
# Revenue estimate for a custom budget, using the `regression` model fitted on
# X and y in the cell above: revenue = intercept + slope * budget.
budget = 300000000  # production budget in US dollars (i.e. $300 million)
revenue_estimate = regression.intercept_[0] + regression.coef_[0,0]*budget
revenue_estimate = round(revenue_estimate, -6)  # round to the nearest million

# Text formatting. The budget label is derived from `budget` so the message
# stays correct when the budget above is changed (300000000 -> "300M").
s = f"{revenue_estimate:,.0f}"
budget_label = f"{budget / 1_000_000:,g}M"
print(f'The estimated revenue for a ${budget_label} film is around ${s}.')