# Análisis de regresión

In [1]:
import numpy as np
import pandas as pd
import cufflinks as cf
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.metrics import accuracy_score, roc_auc_score, mean_absolute_error, r2_score

cf.go_offline()
pd.set_option("display.max_columns", 50)
pd.set_option('display.float_format', lambda x: "{:,.2f}".format(x))

## Funciones relevantes

In [2]:
def normalize_content(texto):
    if texto in ['G', 'TV-G']:
        return "Publico General"
    if texto in ['R', 'NC-17', 'X']:
        return "Restricted"
    if texto in ['M', 'PG', 'GP', 'PG-13','TV-PG','TV-14'] :
        return "Parental Guiadance"
    if texto in ['Approved', 'Passed']:
        return "Approved"
    if texto in ['Not Rated', 'Unrated']:
        return "Not Rated"

In [3]:
def plot_histogram(df, feature):
    return df[[feature]].iplot(kind="hist", title = f"{feature} histogram", colors=["#296EAA"])

## Carga de datos

In [4]:
df = pd.read_csv("../data/movies.csv")

In [5]:
df.shape

(5043, 16)

In [6]:
df.sample(5)

Unnamed: 0,movie_title,movie_imdb_link,color,genre_4,duration,gross,genre_1,genre_2,genre_3,num_voted_users,facenumber_in_poster,language,country,content_rating,title_year,imdb_score
3632,The Kingdom,http://www.imdb.com/title/tt0431197/?ref_=fn_t...,Color,,110.0,47456450.0,Action,Drama,Thriller,101386,1,English,USA,R,2007.0,7.1
2081,Domestic Disturbance,http://www.imdb.com/title/tt0249478/?ref_=fn_t...,Color,,89.0,,Crime,Mystery,Thriller,21283,1,English,USA,PG-13,2001.0,5.5
1596,Hero,http://www.imdb.com/title/tt0299977/?ref_=fn_t...,Black and White,,80.0,84961.0,Action,Adventure,History,149414,4,Mandarin,China,PG-13,2002.0,7.9
2492,The Ring Two,http://www.imdb.com/title/tt0377109/?ref_=fn_t...,Color,,128.0,75888270.0,Horror,Mystery,,71153,0,English,USA,PG-13,2005.0,5.4
1274,The Tempest,http://www.imdb.com/title/tt1274300/?ref_=fn_t...,Color,Romance,110.0,,Comedy,Drama,Fantasy,6147,2,English,USA,PG-13,2010.0,5.4


## Análisis exploratorio

In [7]:
ls_disc = ["color", "genre_4", "genre_1", "genre_2", "genre_3", "language", "country", "content_rating"]
ls_cont = ["duration", "gross", "num_voted_users", "facenumber_in_poster", "title_year"]
target = "imdb_score"

In [8]:
plot_histogram(df=df, feature=target)

In [9]:
df[ls_cont+[target]].describe(percentiles = [0.1, 0.5, 0.95, 0.99])

Unnamed: 0,duration,gross,title_year,imdb_score
count,5028.0,1505.0,4935.0,5043.0
mean,107.2,47523599.37,2042.59,6.46
std,25.2,70034507.15,2818.52,1.71
min,7.0,721.0,1916.0,1.6
10%,86.0,335377.2,1988.0,5.0
50%,103.0,24792061.0,2005.0,6.6
95%,146.0,177159421.4,2015.0,8.1
99%,189.0,336530303.0,2016.0,8.5
max,511.0,760505847.0,200000.0,98.0


In [10]:
df[df["gross"] >= 7.6*10**8]

Unnamed: 0,movie_title,movie_imdb_link,color,genre_4,duration,gross,genre_1,genre_2,genre_3,num_voted_users,facenumber_in_poster,language,country,content_rating,title_year,imdb_score
4425,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,Color,Sci-Fi,178.0,760505847.0,Action,Adventure,Fantasy,886204,0,English,USA,PG-13,2009.0,7.9


In [11]:
df[df[target] == 98]

Unnamed: 0,movie_title,movie_imdb_link,color,genre_4,duration,gross,genre_1,genre_2,genre_3,num_voted_users,facenumber_in_poster,language,country,content_rating,title_year,imdb_score
3319,Sanctuary,nudity|party|pirate|swimsuit|three word title,Color,,82.0,,Comedy,Horror,Thriller,Quite a Conundrum,John Lucas,8,English,USA,200000.0,98.0


In [12]:
df = df[df[target] != 98].reset_index(drop = True)

In [13]:
df["num_voted_users"] = df["num_voted_users"].astype(float)

In [14]:
df[ls_cont+[target]].describe(percentiles = [0.1, 0.5, 0.95, 0.99])

Unnamed: 0,duration,gross,num_voted_users,title_year,imdb_score
count,5027.0,1505.0,5042.0,4934.0,5042.0
mean,107.21,47523599.37,83684.73,2002.47,6.44
std,25.2,70034507.15,138493.99,12.48,1.13
min,7.0,721.0,5.0,1916.0,1.6
10%,86.0,335377.2,1649.7,1988.0,5.0
50%,103.0,24792061.0,34371.0,2005.0,6.6
95%,146.0,177159421.4,332265.45,2015.0,8.09
99%,189.0,336530303.0,681112.44,2016.0,8.5
max,511.0,760505847.0,1689764.0,2016.0,9.5


In [15]:
for col in ls_disc:
    display(df[col].value_counts(True).reset_index())

Unnamed: 0,index,color
0,Color,0.96
1,Black and White,0.04


Unnamed: 0,index,genre_4
0,Thriller,0.34
1,Romance,0.13
2,Family,0.1
3,Sci-Fi,0.08
4,Fantasy,0.07
5,Mystery,0.05
6,War,0.04
7,Drama,0.04
8,Sport,0.03
9,History,0.03


Unnamed: 0,index,genre_1
0,Comedy,0.26
1,Action,0.23
2,Drama,0.19
3,Adventure,0.09
4,Crime,0.07
5,Biography,0.05
6,Horror,0.05
7,Documentary,0.02
8,Animation,0.01
9,Fantasy,0.01


Unnamed: 0,index,genre_2
0,Drama,0.27
1,Adventure,0.11
2,Crime,0.09
3,Romance,0.09
4,Comedy,0.08
5,Horror,0.05
6,Thriller,0.05
7,Mystery,0.05
8,Family,0.04
9,Fantasy,0.04


Unnamed: 0,index,genre_3
0,Thriller,0.18
1,Romance,0.15
2,Drama,0.12
3,Sci-Fi,0.09
4,Fantasy,0.07
5,Family,0.06
6,Comedy,0.06
7,Mystery,0.06
8,Crime,0.04
9,Horror,0.03


Unnamed: 0,index,language
0,English,0.93
1,French,0.01
2,Spanish,0.01
3,Hindi,0.01
4,Mandarin,0.01
5,German,0.0
6,Japanese,0.0
7,Russian,0.0
8,Italian,0.0
9,Cantonese,0.0


Unnamed: 0,index,country
0,USA,0.76
1,UK,0.09
2,France,0.03
3,Canada,0.03
4,Germany,0.02
...,...,...
60,Kyrgyzstan,0.00
61,Egypt,0.00
62,Afghanistan,0.00
63,Soviet Union,0.00


Unnamed: 0,index,content_rating
0,R,0.45
1,PG-13,0.31
2,PG,0.15
3,Not Rated,0.02
4,G,0.02
5,Unrated,0.01
6,Approved,0.01
7,TV-14,0.01
8,TV-MA,0.0
9,X,0.0


In [16]:
df[ls_cont+ls_disc].isnull().sum() / len(df)

duration               0.00
gross                  0.70
num_voted_users        0.00
facenumber_in_poster   0.00
title_year             0.02
color                  0.00
genre_4                0.72
genre_1                0.00
genre_2                0.13
genre_3                0.39
language               0.00
country                0.00
content_rating         0.06
dtype: float64

In [17]:
ls_drop = ["gross", "genre_4", "genre_3", "title_year"]

In [18]:
ls_cont = [x for x in ls_cont if x not in ls_drop]
ls_disc = [x for x in ls_disc if x not in ls_drop]

In [19]:
df = df.dropna(subset = ls_cont+ls_disc).reset_index(drop = True)

In [20]:
df.shape

(4168, 16)

In [21]:
df.sample(10)

Unnamed: 0,movie_title,movie_imdb_link,color,genre_4,duration,gross,genre_1,genre_2,genre_3,num_voted_users,facenumber_in_poster,language,country,content_rating,title_year,imdb_score
3882,Dear Frankie,http://www.imdb.com/title/tt0377752/?ref_=fn_t...,Color,,105.0,1340891.0,Drama,Romance,,18940.0,0,English,UK,PG-13,2004.0,7.8
1104,Reindeer Games,http://www.imdb.com/title/tt0184858/?ref_=fn_t...,Color,Drama,124.0,,Action,Adventure,Crime,31113.0,0,English,USA,R,2000.0,5.7
2813,Jaws,http://www.imdb.com/title/tt0073195/?ref_=fn_t...,Color,,130.0,260000000.0,Adventure,Drama,Thriller,412454.0,0,English,USA,PG,1975.0,8.0
1554,The Work and the Glory,http://www.imdb.com/title/tt0410454/?ref_=fn_t...,Color,,118.0,,Drama,Romance,,833.0,1,English,USA,PG,2004.0,6.9
617,The Sorcerer's Apprentice,http://www.imdb.com/title/tt0963966/?ref_=fn_t...,Color,Fantasy,109.0,63143812.0,Action,Adventure,Family,124185.0,0,English,USA,PG,2010.0,6.1
751,Ride Along,http://www.imdb.com/title/tt1408253/?ref_=fn_t...,Color,Romance,99.0,134141530.0,Action,Comedy,Crime,75903.0,1,English,USA,PG-13,2014.0,6.2
1095,Mercury Rising,http://www.imdb.com/title/tt0120749/?ref_=fn_t...,Color,Thriller,111.0,,Action,Crime,Drama,54316.0,1,English,USA,R,1998.0,6.1
3379,The Legend of Hercules,http://www.imdb.com/title/tt1043726/?ref_=fn_t...,Color,,99.0,,Action,Adventure,Fantasy,44891.0,0,English,USA,PG-13,2014.0,4.2
4108,Dead Like Me: Life After Death,http://www.imdb.com/title/tt1079444/?ref_=fn_t...,Color,,87.0,,Comedy,Drama,Fantasy,10734.0,1,English,USA,R,2009.0,6.1
267,Harold & Kumar Go to White Castle,http://www.imdb.com/title/tt0366551/?ref_=fn_t...,Color,,88.0,,Adventure,Comedy,,155262.0,2,English,USA,R,2004.0,7.1


In [22]:
df["content_rating"] = df["content_rating"].map(normalize_content)

## Modelado

### Preparación de sets

In [23]:
X = df.drop(columns=ls_disc)[ls_cont].join(pd.get_dummies(df[ls_disc]))
y = df[target]

In [24]:
X.shape

(4168, 134)

In [25]:
kb = SelectKBest(k = 5, score_func=f_regression)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=0)

In [27]:
kb.fit(X_train, y_train)


invalid value encountered in true_divide



SelectKBest(k=5, score_func=<function f_regression at 0x7f886ce1f820>)

In [28]:
ls_best = [x for x, y in zip(X.columns, kb.get_support()) if y]

In [29]:
X.columns

Index(['duration', 'num_voted_users', 'facenumber_in_poster',
       'color_ Black and White', 'color_Color', 'genre_1_Action',
       'genre_1_Adventure', 'genre_1_Animation', 'genre_1_Biography',
       'genre_1_Comedy',
       ...
       'country_Taiwan', 'country_Thailand', 'country_UK', 'country_USA',
       'country_West Germany', 'content_rating_Approved',
       'content_rating_Not Rated', 'content_rating_Parental Guiadance',
       'content_rating_Publico General', 'content_rating_Restricted'],
      dtype='object', length=134)

In [30]:
ls_best

['duration',
 'num_voted_users',
 'color_Color',
 'genre_1_Biography',
 'genre_2_Drama']

In [31]:
X_train=X_train[ls_best]

In [32]:
X_test=X_test[ls_best]

### Modelos

In [33]:
linreg = LinearRegression()

In [34]:
linreg.fit(X_train, y_train)

LinearRegression()

In [35]:
linreg.predict(X_test)

array([7.43535462, 8.68256888, 6.67504051, ..., 6.14048016, 7.41992002,
       7.42246765])

In [36]:
r2_score(y_pred=linreg.predict(X_test), y_true=y_test)

0.25650264122103505

In [37]:
linreg.coef_

array([ 5.15622176e-03,  3.03760668e-06, -6.96127397e-01,  5.06622313e-01,
        3.22358928e-01])

In [38]:
linreg.coef_.round(3)

array([ 0.005,  0.   , -0.696,  0.507,  0.322])

In [39]:
mean_absolute_error(y_pred=linreg.predict(X_test), y_true=y_test)

0.7079692829199522

In [40]:
yc_train = (y_train > y_train.mean())*1
yc_test = (y_test > y_train.mean())*1

In [41]:
y_train

2590   6.40
519    6.50
1721   6.70
1861   6.10
936    6.90
       ... 
1033   4.10
3264   7.90
1653   6.70
2607   6.90
2732   7.10
Name: imdb_score, Length: 3126, dtype: float64

In [42]:
yc_train

2590    0
519     1
1721    1
1861    0
936     1
       ..
1033    0
3264    1
1653    1
2607    1
2732    1
Name: imdb_score, Length: 3126, dtype: int64

In [43]:
yc_train.value_counts(1)

1   0.56
0   0.44
Name: imdb_score, dtype: float64

In [44]:
yc_test.value_counts(1)

1   0.51
0   0.49
Name: imdb_score, dtype: float64

In [45]:
logreg = LogisticRegression()

In [46]:
logreg.fit(X_train, yc_train)

LogisticRegression()

In [47]:
roc_auc_score(y_score=logreg.predict_proba(X_train)[:,1],y_true=yc_train )

0.6838008575077541

In [48]:
roc_auc_score(y_score=logreg.predict_proba(X_test)[:,1],y_true=yc_test )


0.6653099596305923

In [49]:
X_train[ls_best]

Unnamed: 0,duration,num_voted_users,color_Color,genre_1_Biography,genre_2_Drama
2590,107.00,38191.00,1,0,0
519,114.00,87950.00,1,0,1
1721,89.00,29843.00,1,0,0
1861,90.00,86890.00,1,0,0
936,130.00,29932.00,1,0,0
...,...,...,...,...,...
1033,95.00,4875.00,1,0,0
3264,102.00,279093.00,1,0,0
1653,106.00,271592.00,1,0,0
2607,81.00,52055.00,1,0,0
