# Logistic Model

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
# Read in cleaned movies DataFrame
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
movies = pd.read_pickle('Pickles/movies_cleaned.pkl')

In [3]:
# Distinct genre names across all movies, kept in order of first appearance.
all_genres = []
seen_genres = set()  # membership mirror of all_genres for quick duplicate checks
for genres in movies.genres_list:
    for genre in genres:
        if genre in seen_genres:
            continue
        seen_genres.add(genre)
        all_genres.append(genre)

In [4]:
# Positional split of the movies that have at least one genre label:
# the first 80% of rows become the training set, the remainder the test set.
# (The original note counted 41870 labelled rows, i.e. 33496 train / 8374 test;
# deriving the cut from the data keeps the two sets disjoint and exhaustive
# if the row count ever changes.)
TRAIN_FRACTION = 0.8

labelled_movies = movies[movies.num_genres != 0]
n_train = int(round(TRAIN_FRACTION * len(labelled_movies)))

# .copy() makes both frames independent of `movies`, so adding prediction
# columns to genre_test later does not write into a slice of another frame.
genre_train = labelled_movies.iloc[:n_train].copy()
genre_test = labelled_movies.iloc[n_train:].copy()

In [5]:
def check_model(row, model_name):
    """Return True when the model's predicted genre is one of the row's genres.

    row: a record exposing `genres_list` as an attribute and the prediction
        under the key `<model_name>_pred`.
    model_name: prefix of the prediction column to look up.
    """
    predicted_genre = row[model_name + '_pred']
    return predicted_genre in row.genres_list

In [6]:
# Flatten the training set into one (overview, genre) example per genre of each
# movie, so a movie with several genres contributes its overview once per genre.
training_pairs = [
    (overview, genre)
    for overview, genres in zip(genre_train.overview, genre_train.genres_list)
    for genre in genres
]
text = [overview for overview, _ in training_pairs]
labels = [genre for _, genre in training_pairs]

### Logistic Regression

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

#### Baseline

In [8]:
# Baseline: default TF-IDF features feeding a one-vs-rest logistic regression.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases - confirm against the installed version before upgrading.
tfidf_step = TfidfVectorizer()
logit_step = LogisticRegression(multi_class='ovr')
logistic_clf1 = make_pipeline(tfidf_step, logit_step)
logistic_clf1.fit(text, labels)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_i...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [9]:
# Predict a single genre for every test overview and store it next to the row.
genre_test['logistic_clf1_pred'] = logistic_clf1.predict(genre_test['overview'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [10]:
# A prediction counts as correct when it is one of the movie's true genres.
genre_test['logistic_clf1_correct'] = genre_test.apply(
    check_model, axis=1, model_name='logistic_clf1'
)

# The mean of a boolean column is the share of True values, i.e. the accuracy.
# (The former bare value_counts() call was dropped: it was not the cell's last
# expression, so its result was computed and then thrown away.)
clf1_accuracy = float(genre_test['logistic_clf1_correct'].mean())
print("Model Accuracy: ", clf1_accuracy)

Model Accuracy:  0.6563171721996657


In [11]:
# Number of test movies per (predicted genre, prediction correct?) pair.
clf1_counts = genre_test.groupby(['logistic_clf1_pred', 'logistic_clf1_correct']).title.count()

# Share of correct / incorrect predictions within each predicted genre.
# transform('sum') returns the per-genre totals aligned to clf1_counts' own
# index, so the division keeps the original two-level index regardless of how
# the installed pandas version handles group keys in groupby.apply.
clf1_totals = clf1_counts.groupby(level=0).transform('sum')
(clf1_counts / clf1_totals).unstack()

logistic_clf1_correct,False,True
logistic_clf1_pred,Unnamed: 1_level_1,Unnamed: 2_level_1
action,0.292035,0.707965
adventure,0.4375,0.5625
animation,0.027027,0.972973
comedy,0.334682,0.665318
crime,0.4,0.6
documentary,0.121481,0.878519
drama,0.404734,0.595266
family,0.235294,0.764706
fantasy,0.361111,0.638889
history,1.0,


### Using 1-3 n-grams (experiment currently disabled; cells below are commented out)

In [12]:
# logistic_clf2 = make_pipeline(TfidfVectorizer(ngram_range=(1,3)),LogisticRegression(multi_class='ovr'))
# logistic_clf2.fit(text,labels)

In [13]:
# genre_test['logistic_clf2_pred'] = logistic_clf2.predict(genre_test.overview)

In [14]:
# genre_test['logistic_clf2_correct'] = genre_test.apply(lambda row: check_model(row,'logistic_clf2'),axis=1)
# genre_test.logistic_clf2_correct.value_counts()
# print("Model Accuracy: ",len(genre_test[genre_test.logistic_clf2_correct==True])/len(genre_test))

### Predict and Export

In [15]:
# Load the reduced movies DataFrame that will receive the predicted genres.
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
movies_en = pd.read_pickle('Pickles/movies_reduced.pkl')

In [16]:
# Predict one genre per overview with the baseline model, for every row.
movies_en['predicted_genre'] = logistic_clf1.predict(movies_en['overview'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [17]:
# Display the column index to confirm 'predicted_genre' was added.
movies_en.columns

Index(['title', 'overview', 'year', 'genres_list', 'belongs_to_collection',
       'budget', 'original_language', 'popularity', 'revenue', 'runtime',
       'tagline', 'vote_average', 'vote_count', 'num_genres',
       'production_companies_list', 'production_countries_list', 'month',
       'day', 'genre_action', 'genre_adventure', 'genre_animation',
       'genre_comedy', 'genre_crime', 'genre_documentary', 'genre_drama',
       'genre_family', 'genre_fantasy', 'genre_foreign', 'genre_history',
       'genre_horror', 'genre_music', 'genre_mystery', 'genre_romance',
       'genre_science_fiction', 'genre_thriller', 'genre_tv_movie',
       'genre_war', 'genre_western', 'cast_list', 'director_list',
       'keywords_list', 'weighted_rating', 'predicted_genre'],
      dtype='object')

In [18]:
# Give every movie without a genre label (num_genres == 0) its predicted genre.
# The lists stored in `genres_list` are extended in place; `num_genres` is left
# untouched. The membership check keeps this cell idempotent: re-running it
# does not append the same prediction a second time.
unlabelled = movies_en.num_genres == 0
for genres, predicted in zip(movies_en.loc[unlabelled, 'genres_list'],
                             movies_en.loc[unlabelled, 'predicted_genre']):
    if predicted not in genres:
        genres.append(predicted)

In [19]:
# Distinct genre names across movies_en, kept in order of first appearance.
all_genres = []
seen_genres = set()  # membership mirror of all_genres for quick duplicate checks
for genres in movies_en.genres_list:
    for genre in genres:
        if genre in seen_genres:
            continue
        seen_genres.add(genre)
        all_genres.append(genre)

In [20]:
# Alphabetical order gives the dummy columns a stable, readable ordering.
all_genres = sorted(all_genres)
all_genres

['action',
 'adventure',
 'animation',
 'comedy',
 'crime',
 'documentary',
 'drama',
 'family',
 'fantasy',
 'foreign',
 'history',
 'horror',
 'music',
 'mystery',
 'romance',
 'science_fiction',
 'thriller',
 'tv_movie',
 'war',
 'western']

In [21]:
# Helper used to re-generate the genre dummy columns.
def check_genre(row, genre_target):
    """Return 1 if `genre_target` is in the row's `genres_list`, otherwise 0."""
    return 1 if genre_target in row.genres_list else 0

# One 0/1 indicator column per genre, named 'genre_<name>'.
for genre_name in all_genres:
    indicator = movies_en.apply(check_genre, axis=1, genre_target=genre_name)
    movies_en['genre_' + genre_name] = indicator

In [24]:
# Save the frame, including the predicted genres and regenerated dummy columns.
movies_en.to_pickle('Pickles/movies_reduced_predGenres.pkl')