# Key Objectives
1. Clean and normalize the movie description text (`movie_info`)
2. Create a Count Vectorizer: remove stop words and set the minimum document frequency (`min_df`) to 10%
3. Create a Naive Bayes model and a Logistic Regression model for prediction
4. Compare their accuracy scores and classification reports 
5. Using the better performing model, return the top 5 movies that the model predicts are most likely directed by a woman

In [12]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import text_preprocessing
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Load the movie review dataset (path is relative to the notebook's working directory).
reviews_path = 'Data/movie_reviews.csv'
df = pd.read_csv(reviews_path)

In [7]:
# Peek at the first two rows to check the columns that were loaded.
df.head(n=2)

Unnamed: 0,movie_title,rating,genre,in_theaters_date,movie_info,directors,director_gender,tomatometer_rating,audience_rating,critics_consensus
0,A Dog's Journey,PG,"Drama, Kids & Family",5/17/19,Bailey (voiced again by Josh Gad) is living th...,Gail Mancuso,female,50,92,A Dog's Journey is as sentimental as one might...
1,A Dog's Way Home,PG,Drama,1/11/19,"Separated from her owner, a dog sets off on an...",Charles Martin Smith,male,60,71,A Dog's Way Home may not quite be a family-fri...


In [8]:
# Run the raw descriptions through the project's text_preprocessing helper and
# store the result next to the original text.
# NOTE(review): clean_normalize is defined outside this notebook; judging by the
# preview that follows, it appears to strip punctuation and stop words and
# lemmatize the tokens - confirm against the helper module.
raw_descriptions = df['movie_info']
df['movie_info_clean'] = text_preprocessing.clean_normalize(raw_descriptions)

In [11]:
# Re-check the first two rows now that movie_info_clean has been added.
df.head(n=2)

Unnamed: 0,movie_title,rating,genre,in_theaters_date,movie_info,directors,director_gender,tomatometer_rating,audience_rating,critics_consensus,movie_info_clean
0,A Dog's Journey,PG,"Drama, Kids & Family",5/17/19,Bailey (voiced again by Josh Gad) is living th...,Gail Mancuso,female,50,92,A Dog's Journey is as sentimental as one might...,Bailey voice Josh Gad live good life Michigan ...
1,A Dog's Way Home,PG,Drama,1/11/19,"Separated from her owner, a dog sets off on an...",Charles Martin Smith,male,60,71,A Dog's Way Home may not quite be a family-fri...,separate owner dog set 400mile journey safety ...


In [13]:
# Build a document-term matrix from the cleaned descriptions.
# English stop words are dropped, and min_df=0.1 discards any term that occurs
# in fewer than 10% of the descriptions.
cv = CountVectorizer(min_df=0.1, stop_words='english')
dtm = cv.fit_transform(df['movie_info_clean'])

# Densify the sparse counts into a DataFrame with one column per retained term.
feature_names = cv.get_feature_names_out()
dtm_df = pd.DataFrame(data=dtm.toarray(), columns=feature_names)
dtm_df

Unnamed: 0,begin,discover,family,film,follow,force,friend,home,leave,life,...,man,new,set,star,story,turn,woman,world,year,young
0,1,0,0,0,0,0,1,0,1,3,...,0,2,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,0,1,...,0,1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
162,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
163,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,1
164,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,0


In [14]:
# Target variable: the director's gender label for each movie.
y = df.loc[:, 'director_gender']

In [15]:
# Hold out 20% of the movies for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, and the reports below show only
# 5 'female' rows against 29 'male' rows in the test set - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    dtm_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Naive Bayes
# Fit a multinomial Naive Bayes classifier on the training term counts
# (fit() hands back the fitted estimator, so it can be bound directly).
model = MultinomialNB().fit(X_train, y_train)

# Score the held-out movies and report overall accuracy plus per-class metrics.
y_pred = model.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', nb_accuracy)
print(classification_report(y_test, y_pred))

Accuracy: 0.7941176470588235
              precision    recall  f1-score   support

      female       0.00      0.00      0.00         5
        male       0.84      0.93      0.89        29

    accuracy                           0.79        34
   macro avg       0.42      0.47      0.44        34
weighted avg       0.72      0.79      0.76        34



In [18]:
# Logistic Regression
# Fit a logistic regression classifier (default settings) on the same training split
# (fit() hands back the fitted estimator, so it can be bound directly).
model_lr = LogisticRegression().fit(X_train, y_train)

# Score the held-out movies and report overall accuracy plus per-class metrics.
y_pred_lr = model_lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print('Accuracy:', lr_accuracy)
print(classification_report(y_test, y_pred_lr))

Accuracy: 0.7941176470588235
              precision    recall  f1-score   support

      female       0.33      0.40      0.36         5
        male       0.89      0.86      0.88        29

    accuracy                           0.79        34
   macro avg       0.61      0.63      0.62        34
weighted avg       0.81      0.79      0.80        34



In [20]:
# Naive Bayes probability, for every movie, that the director is female.
# predict_proba orders its columns like model.classes_, so the 'female' column
# is looked up by label rather than assumed to be column 0; a missing label
# raises ValueError instead of silently returning the wrong class.
nb_female_col = list(model.classes_).index('female')
model.predict_proba(dtm_df)[:, nb_female_col]

array([0.55232401, 0.07327573, 0.18025395, 0.42883046, 0.36510223,
       0.20454545, 0.32619209, 0.02676067, 0.04665098, 0.19210722,
       0.17790179, 0.46759192, 0.19413093, 0.05010811, 0.13778194,
       0.14930556, 0.38464986, 0.07596549, 0.14019258, 0.40736262,
       0.29725855, 0.22648356, 0.25287232, 0.41829938, 0.19413093,
       0.26446179, 0.22331808, 0.78319897, 0.29054054, 0.01697598,
       0.026078  , 0.04996359, 0.03012186, 0.08344048, 0.29054054,
       0.06424266, 0.05656657, 0.13102303, 0.7088059 , 0.20454545,
       0.03012186, 0.3605702 , 0.20454545, 0.05609155, 0.42390359,
       0.20454545, 0.34822175, 0.0039424 , 0.28995155, 0.27991356,
       0.22089041, 0.21845463, 0.11444763, 0.39938921, 0.09001851,
       0.75400861, 0.00364618, 0.01453324, 0.03427748, 0.22089041,
       0.19466118, 0.3283173 , 0.27164819, 0.1139709 , 0.34062139,
       0.06787267, 0.03012186, 0.0306376 , 0.19413093, 0.63286855,
       0.59437474, 0.41292431, 0.25744834, 0.02366409, 0.01599

In [22]:
# Score every movie with the logistic regression model. Both models reach the
# same test accuracy, but only logistic regression recovers any of the
# female-directed movies in the test set.
# predict_proba orders its columns like model_lr.classes_, so the 'female'
# column is looked up by label rather than assumed to be column 0; a missing
# label raises ValueError instead of silently returning the wrong class.
# Note: dtm_df includes the rows the model was trained on, so the scores for
# those movies are in-sample.
female_col = list(model_lr.classes_).index('female')
df['female_director_prediction'] = model_lr.predict_proba(dtm_df)[:, female_col]
df.head()

Unnamed: 0,movie_title,rating,genre,in_theaters_date,movie_info,directors,director_gender,tomatometer_rating,audience_rating,critics_consensus,movie_info_clean,female_director_prediction
0,A Dog's Journey,PG,"Drama, Kids & Family",5/17/19,Bailey (voiced again by Josh Gad) is living th...,Gail Mancuso,female,50,92,A Dog's Journey is as sentimental as one might...,Bailey voice Josh Gad live good life Michigan ...,0.54986
1,A Dog's Way Home,PG,Drama,1/11/19,"Separated from her owner, a dog sets off on an...",Charles Martin Smith,male,60,71,A Dog's Way Home may not quite be a family-fri...,separate owner dog set 400mile journey safety ...,0.104747
2,A Tuba to Cuba,NR,"Documentary, Musical & Performing Arts",2/15/19,The leader of New Orleans' famed Preservation ...,"Danny Clinch, T.G. Herrington",male,100,82,,leader New Orleans fame Preservation Hall Jazz...,0.09402
3,A Vigilante,R,Drama,3/29/19,"A once abused woman, Sadie (Olivia Wilde), dev...",Sarah Daggar-Nickson,female,92,50,Led by Olivia Wilde's fearless performance and...,abuse woman Sadie Olivia Wilde devote rid vict...,0.347006
4,After,PG-13,"Drama, Romance",4/12/19,Based on Anna Todd's best-selling novel which ...,Jenny Gage,female,17,72,"Tepid and tired, After's fun flourishes are le...",base Anna Todds bestselle novel publishing sen...,0.378965


In [25]:
# Rank the movies from most to least likely to be female-directed according to
# the logistic regression scores, and show the ten highest. The first five
# rows are the "top 5" asked for in the objectives.
ranking_columns = [
    'movie_title',
    'movie_info',
    'directors',
    'director_gender',
    'female_director_prediction',
]
ranked_movies = df[ranking_columns].sort_values(
    by='female_director_prediction',
    ascending=False,
)
ranked_movies.head(10)

Unnamed: 0,movie_title,movie_info,directors,director_gender,female_director_prediction
55,Greta,"A sweet, naïve young woman trying to make it o...",Neil Jordan,male,0.829956
140,The Secret Life of Pets 2,THE SECRET LIFE OF PETS 2 will follow summer 2...,"Chris Renaud, Jonathan Del Val",male,0.701463
27,Charlie Says,Three young women were sentenced to death for ...,Mary Harron,female,0.683698
76,Mary Magdalene,Set in the Holy Land in the first century C.E....,Garth Davis,male,0.643084
38,Egg,An unflinching comedy about why women choose m...,Marianna Palka,female,0.638023
16,Ash Is Purest White,A tragicomedy initially set in the jianghu-cri...,Zhangke Jia,male,0.616894
70,Little Woods,"Little Woods, North Dakota, a fracking boomtow...",Nia DaCosta,female,0.57904
69,Little,Marsai Martin (TV's Black-ish) stars in and ex...,Tina Gordon Chism,female,0.559209
164,Wine Country,In honor of Rebecca (Rachel Dratch)'s 50th bir...,Amy Poehler,female,0.558396
0,A Dog's Journey,Bailey (voiced again by Josh Gad) is living th...,Gail Mancuso,female,0.54986
