In [38]:
from glob import glob
import pandas as pd
import numpy as np
import os
import json
from collections import defaultdict

from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# 1. Data pre-processing

In [2]:
# df_movies = pd.read_csv('dataset/tmdb_5000_movies.csv')
# df_movies = df_movies.dropna(subset=['overview', 'release_date', 'runtime'])
# df_movies.isnull().sum()
# df_movies.to_csv("dataset/tmdb_5000_movies_nonull.csv")

### Load two datasets and merge them by movie_id

In [3]:
df = pd.read_csv('dataset/tmdb_5000_movies_nonull.csv')
# df_movies.loc[:,['id', 'keywords', 'genres', 'title']]
df_credits = pd.read_csv('dataset/tmdb_5000_credits.csv')
credits_sub = df_credits.loc[:, ['movie_id', 'cast']].rename(columns={
                                                             'movie_id': 'id'})

df = df[['id', 'budget', 'genres', 'keywords', 'original_language',
         'overview', 'popularity', 'production_companies',
         'production_countries', 'release_date', 'revenue', 'runtime',
         'spoken_languages', 'tagline', 'title', 'vote_average', 'vote_count']]
df = df.merge(credits_sub)

# df.sort_values(by='release_date', ascending=False).head(10)

### Drop movies with no genres information

In [4]:
# drop movies with no genre info
for i, row in df.iterrows():
    if row['genres'] == '[]':
        df.drop(i, inplace=True)
df = df.dropna()

### Column transformation

**Steps:**  
+ Filter columns with specific filtering values
+ Convert json-format cells to list
+ Extract cast gender and calculate the proportion of female cast and male cast for each movie

In [5]:
def convert_list(cell):
    """convert the json format to a list of categories"""
    kw_list = []
    for kw in json.loads(cell):
        kw_list.append(kw['name'])
    return kw_list


def larger_n(col, n):
    """filter the column"""
    keywords = defaultdict(int)
    for row in df[col]:
        row = json.loads(row)
        for entry in row:
            keywords[entry['name']] += 1
    kw_cnt = sorted(keywords.items(), key=lambda x: -x[1])
    return [kw[0] for kw in kw_cnt if kw[1] >= n]


def extract_gender(cell):
    """Extract cast gender"""
    female = 0
    male = 0
    for item in json.loads(cell):
        if item['gender'] == 1:
            female += 1
        elif item['gender'] == 2:
            male += 1
        else:
            continue
    return female, male


def concat_names(cell):
    """Concatenate first names and last names"""
    names = []
    for name in cell:
        names.append(name.replace(' ', ''))
    return names


def list2str(cell):
    """Convert list to string"""
    return ' '.join(cell)


def transform_cols(df, cols_to_transform):
    """Transform columns of a dataframe.
    cols_to_transform should be a dict(col_name: filter value n)
    """
    for col_name in cols_to_transform.keys():
        larger_col = larger_n(col_name, cols_to_transform[col_name])
        if col_name == 'cast':
            gen = df[col_name].apply(extract_gender)
            df['female_pct'] = gen.apply(lambda x: x[0]/(x[0]+x[1]+0.001))
            df['male_pct'] = gen.apply(lambda x: x[1]/(x[0]+x[1]+0.001))

        df[col_name] = df[col_name].apply(convert_list)\
            .apply(lambda cell: [kw for kw in cell if kw in larger_col])
    return df

In [6]:
cols_to_transform = {'keywords': 30,
                     'genres': 0,
                     'production_companies': 5,
                     'production_countries': 3,
                     'spoken_languages': 10,
                     'cast': 2}

df_movies = transform_cols(df, cols_to_transform)

In [7]:
df_movies = df_movies.reset_index(drop=True)
df_movies.head(10)

Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast,female_pct,male_pct
0,19995,237000000,"[Action, Adventure, Fantasy, Science Fiction]","[future, alien, soldier, 3d]",en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]",2009-12-10,2787965087,162.0,"[English, Español]",Enter the World of Pandora.,Avatar,7.2,11800,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",0.374988,0.62498
1,285,300000000,"[Adventure, Fantasy, Action]","[love of one's life, aftercreditsstinger]",en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films]",[United States of America],2007-05-19,961000000,169.0,[English],"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",0.166661,0.833306
2,206647,245000000,"[Action, Adventure, Crime]","[spy, based on novel, sequel]",en,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq]","[United Kingdom, United States of America]",2015-10-26,880674609,148.0,"[Français, English, Español, Italiano, Deutsch]",A Plan No One Escapes,Spectre,6.3,4466,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",0.191174,0.808812
3,49026,250000000,"[Action, Crime, Drama, Thriller]","[terrorist, secret identity, superhero]",en,Following the death of District Attorney Harve...,112.31295,"[Legendary Pictures, Warner Bros., DC Entertai...",[United States of America],2012-07-16,1084939099,165.0,[English],The Legend Ends,The Dark Knight Rises,7.6,9106,"[Christian Bale, Michael Caine, Gary Oldman, A...",0.130951,0.869037
4,49529,260000000,"[Action, Adventure, Science Fiction]","[based on novel, alien, escape, 3d]",en,"John Carter is a war-weary, former military ca...",43.926995,[Walt Disney Pictures],[United States of America],2012-03-07,284139100,132.0,[English],"Lost in our world, found in another.",John Carter,6.1,2124,"[Taylor Kitsch, Lynn Collins, Samantha Morton,...",0.23999,0.75997
5,559,258000000,"[Fantasy, Action, Adventure]","[love of one's life, marvel comic, sequel, sup...",en,The seemingly invincible Spider-Man goes up ag...,115.699814,"[Columbia Pictures, Laura Ziskin Productions, ...",[United States of America],2007-05-01,890871626,139.0,"[English, Français]",The battle within.,Spider-Man 3,5.9,3576,"[Tobey Maguire, Kirsten Dunst, James Franco, T...",0.629206,0.370782
6,38757,260000000,"[Animation, Family]","[magic, musical, duringcreditsstinger]",en,When the kingdom's most wanted-and most charmi...,48.681969,"[Walt Disney Pictures, Walt Disney Animation S...",[United States of America],2010-11-24,591794936,100.0,[English],They're taking adventure to new lengths.,Tangled,7.4,3330,"[Zachary Levi, Mandy Moore, Donna Murphy, Ron ...",0.19998,0.79992
7,99861,280000000,"[Action, Adventure, Science Fiction]","[marvel comic, sequel, superhero, based on com...",en,When Tony Stark tries to jumpstart a dormant p...,134.279229,"[Marvel Studios, Revolution Sun Studios]",[United States of America],2015-04-22,1405403694,141.0,[English],A New Age Has Come.,Avengers: Age of Ultron,7.3,6767,"[Robert Downey Jr., Chris Hemsworth, Mark Ruff...",0.333323,0.666646
8,767,250000000,"[Adventure, Fantasy, Family]","[witch, magic]",en,"As Harry begins his sixth year at Hogwarts, he...",98.885637,"[Warner Bros., Heyday Films]","[United Kingdom, United States of America]",2009-07-07,933959197,153.0,[English],Dark Secrets Revealed,Harry Potter and the Half-Blood Prince,7.4,5293,"[Daniel Radcliffe, Rupert Grint, Emma Watson, ...",0.414624,0.585352
9,209112,250000000,"[Action, Adventure, Fantasy]","[superhero, based on comic book, revenge]",en,Fearing the actions of a god-like Super Hero l...,155.790452,"[DC Comics, Atlas Entertainment, Warner Bros.,...",[United States of America],2016-03-23,873260194,151.0,[English],Justice or revenge,Batman v Superman: Dawn of Justice,5.7,7004,"[Ben Affleck, Henry Cavill, Gal Gadot, Amy Ada...",0.322578,0.677414


# 2. Feature Engineering

### Runtime - drop 0 runtime

In [9]:
df_movies.loc[df_movies.runtime == 0]

Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast,female_pct,male_pct
2805,41894,0,"[Drama, Crime]",[],en,A drama based on the true story in which a bla...,0.397341,[],[],2010-02-01,0,0.0,[],No one changes the world alone.,Blood Done Sign My Name,6.0,5,"[Michael Rooker, Emily Alyn Lind, Omar Benson ...",0.42851,0.571347
3234,113406,0,"[Comedy, Drama]",[],en,"A self-centered, middle-aged pitchman for a po...",0.40703,[],[United States of America],2012-04-28,0,0.0,[English],Even Shakespeare didn't see this one coming.,Should've Been Romeo,0.0,0,"[Paul Ben-Victor, Ed Asner, Jordenn Thompson, ...",0.384586,0.615337
3633,51820,1500000,[Comedy],[independent film],en,A Beauty shop owner finds romance as she strug...,2.02817,[],[],2005-01-13,0,0.0,[English],Where you get more than just a hair cut!,The Salon,3.5,1,"[Dondre Whitfield, Kym Whitley, Monica Calhoun...",0.42851,0.571347
3674,107315,0,"[Thriller, Horror]",[],en,When Jack (Edward Furlong) is in danger of mis...,1.36514,[],[Canada],2011-10-22,0,0.0,[English],There's nothing scarier than a blank page.,Below Zero,4.4,12,"[Edward Furlong, Michael Berryman, Kristin Booth]",0.333222,0.666445
3684,202604,0,"[Horror, Documentary, Mystery]",[],en,Documentary following US film-maker Joe Marino...,0.447166,[],[],2013-05-29,0,0.0,"[Italiano, English]",The public were never meant to know,The Vatican Exorcisms,4.4,11,[],0.0,0.0
3824,285743,0,"[Music, Horror]","[musical, sequel]",en,The Devil's Carnival: Alleluia! is the second ...,0.674398,[],[United States of America],2016-03-29,0,0.0,[],Hell ain't got a prayer.,Alleluia! The Devil's Carnival,6.0,2,"[Terrance Zdunich, Paul Sorvino, Nivek Ogre, A...",0.363603,0.636306


Checking the six movies that have no runtime data, we can see that these six movies also lack revenue data, production companies, keywords, and all have very few vote counts. So we can think that removing these six rows will not have significant influence on our model.  

In [10]:
df_movies = df_movies.drop(df_movies.loc[df_movies.runtime == 0].index)

In [11]:
df_movies = df_movies.reset_index(drop=True)

### Overview -- TfidfVectorizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

df_movies = df_movies.loc[df_movies.overview.apply(lambda x: len(x) > 10)]
# df_movies = df_movies.reset_index(drop=True)

tfidf = TfidfVectorizer(stop_words='english', binary=True,
                        ngram_range=(1, 2), min_df=0.001)
overview_vec = tfidf.fit_transform(df_movies['overview'])

overview = pd.DataFrame(overview_vec.toarray(), columns=list(range(overview_vec.toarray().shape[1])))
overview_vec = overview_vec.todense()

### Cast names -- CountVectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# concatenate names and join the list of names to a string
df_movies['cast'] = df_movies['cast'].apply(concat_names).apply(list2str)

vectorizer = CountVectorizer(ngram_range=(1, 2))
cast_vect = vectorizer.fit_transform(df_movies['cast'])

cast = cast_vect.todense()

### Budget + Revenue -- deal with abnormal values

There are some movies in the data set has less than $100 budget, which obviously does not make any sense. So we need to deal with these abnormal values.

In [14]:
# number of movies with less than $100 budget
df_movies.loc[(df_movies.budget<100)].shape[0]

598

For these 107 movies with less than \\$100 budget and more than \\$10000 revenue, we use `revenue + overview + popularity + release_date` to predict the actual budget.

In [15]:
df_movies.loc[(df_movies.budget<100) & (df_movies.revenue>10000)].shape

(107, 20)

In [16]:
from sklearn import ensemble


def predict_budget1(dataset):
    """Predict budget of movies with less than $100 budget and more than $10000 revenue"""
    data_p = dataset[['revenue', 'popularity', 'budget']]
    data_p = pd.concat([data_p, overview], axis=1)

    x_train = data_p.loc[~(data_p.budget < 100) & (
        data_p.revenue > 10000)].drop('budget', 1)
    y_train = data_p.loc[~(data_p.budget < 100) &
                         (data_p.revenue > 10000)]['budget']
    x_test = data_p.loc[(data_p.budget < 100) & (
        data_p.revenue > 10000)].drop('budget', 1)

    rfr = ensemble.RandomForestRegressor(random_state=42)
    rfr.fit(x_train, y_train)

    y_test = rfr.predict(x_test)
    return y_test


df_movies.loc[(df_movies.budget < 100) & 
              (df_movies.revenue > 10000), 'budget'] = predict_budget1(df_movies)

  from numpy.core.umath_tests import inner1d


In [17]:
# check
df_movies.loc[(df_movies.budget<100) & (df_movies.revenue>10000)].shape

(0, 20)

For others whose revenue are below $10000, use `overview + popularity + release_date` to predict budget

In [18]:
df_movies.loc[(df_movies.budget<100)].shape

(491, 20)

In [19]:
def predict_budget2(dataset):
    """Predict budget of movies with less than $100 budget and less than $10000 revenue"""
    data_p = dataset[['popularity', 'budget']]
    data_p = pd.concat([data_p, overview], axis=1)

    x_train = data_p.loc[~(data_p.budget < 100)].drop('budget', 1)
    y_train = data_p.loc[~(data_p.budget < 100)]['budget']
    x_test = data_p.loc[(data_p.budget < 100) & (
        df_movies.revenue <= 10000)].drop('budget', 1)

    rfr = ensemble.RandomForestRegressor(random_state=42)
    rfr.fit(x_train, y_train)
#     print(x_test.shape)
    y_test = rfr.predict(x_test)
    return y_test


df_movies.loc[(df_movies.budget < 100) & (df_movies.revenue <=
                                          10000), 'budget'] = predict_budget2(df_movies)

In [20]:
df_movies.loc[(df_movies.budget<100)].shape

(0, 20)

### Keywords + tagline

Maybe we should not use tagline as a feature, two reasons:  
+ tagline is too short for each movie, and has almost no overlap across movies
+ Even, as a human, I cannot have a good guess of the movie genres reading only the tagline when I already know the movie.

In [21]:
# check all keywords and their count
kw_all = defaultdict(int)

for cell in df_movies.keywords:
    for kw in cell:
        kw_all[kw] += 1

print(len(kw_all))
sorted(kw_all.items(), key=lambda x: x[1], reverse=True)

124


[('duringcreditsstinger', 290),
 ('woman director', 231),
 ('independent film', 221),
 ('based on novel', 175),
 ('murder', 174),
 ('aftercreditsstinger', 160),
 ('violence', 141),
 ('dystopia', 138),
 ('revenge', 114),
 ('sport', 102),
 ('sex', 96),
 ('3d', 95),
 ('friendship', 93),
 ('teenager', 93),
 ('musical', 88),
 ('sequel', 86),
 ('biography', 84),
 ('love', 83),
 ('suspense', 79),
 ('new york', 78),
 ('los angeles', 76),
 ('police', 76),
 ('alien', 74),
 ('high school', 74),
 ('nudity', 67),
 ('superhero', 65),
 ('london england', 65),
 ('prison', 64),
 ('family', 63),
 ('dying and death', 61),
 ('drug', 61),
 ('father son relationship', 58),
 ('remake', 55),
 ('wedding', 54),
 ('serial killer', 54),
 ('daughter', 53),
 ('magic', 52),
 ('corruption', 52),
 ('kidnapping', 51),
 ('based on comic book', 50),
 ('friends', 49),
 ('suicide', 49),
 ('airplane', 49),
 ('escape', 48),
 ('world war ii', 48),
 ('survival', 48),
 ('fbi', 48),
 ('time travel', 47),
 ('hospital', 47),
 ('ma

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# join the list of keywords to a string
df_movies['keywords'] = df_movies['keywords'].apply(list2str)

vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
keywords_vect = vectorizer.fit_transform(df_movies['keywords'])

keywords = keywords_vect.todense()

### Original_language + Spoken_language
1) "Original language" is the language of production.  
2) "Spoken language" includes both the language of production and other languages spoken in the movie, if exist.

give up this feature~~~~

### production_companies

In [37]:
prod_com = pd.DataFrame(mlb.fit_transform(
    df_movies['production_companies']), columns=mlb.classes_).add_prefix('prodComp_')

### production_countries

In [39]:
prod_coun = pd.DataFrame(mlb.fit_transform(
    df_movies['production_countries']), columns=mlb.classes_).add_prefix('prodComp_')

### release_date

In [66]:
# df_movies['year_bucket'] = 
years = pd.get_dummies(df_movies.apply(lambda row: str(row.release_date)[:3], axis=1))
years['before97'] = ww.iloc[:,:6].sum(axis=1)
years = years.iloc[:,6:]

### popularity + vote_average + vote_count

In [67]:
# df_movies.loc[df_movies['vote_average']!=0].vote_average.mean()
df_movies.loc[df_movies['vote_average']==0, 'vote_average'] = df_movies.loc[df_movies['vote_average']!=0].vote_average.mean()
df_movies.loc[df_movies['vote_count']==0, 'vote_count'] = df_movies.loc[df_movies['vote_count']!=0].vote_count.mean()


# Result-dataset

In [24]:
df_movies = df_movies.reset_index(drop=True)

In [25]:
df_movies.head(5)

Unnamed: 0,id,budget,genres,keywords,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,vote_average,vote_count,cast,female_pct,male_pct
0,19995,237000000.0,"[Action, Adventure, Fantasy, Science Fiction]",future alien soldier 3d,en,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[Ingenious Film Partners, Twentieth Century Fo...","[United States of America, United Kingdom]",2009-12-10,2787965087,162.0,"[English, Español]",Enter the World of Pandora.,Avatar,7.2,11800,SamWorthington ZoeSaldana SigourneyWeaver Step...,0.374988,0.62498
1,285,300000000.0,"[Adventure, Fantasy, Action]",love of one's life aftercreditsstinger,en,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[Walt Disney Pictures, Jerry Bruckheimer Films]",[United States of America],2007-05-19,961000000,169.0,[English],"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,JohnnyDepp OrlandoBloom KeiraKnightley Stellan...,0.166661,0.833306
2,206647,245000000.0,"[Action, Adventure, Crime]",spy based on novel sequel,en,A cryptic message from Bond’s past sends him o...,107.376788,"[Columbia Pictures, Danjaq]","[United Kingdom, United States of America]",2015-10-26,880674609,148.0,"[Français, English, Español, Italiano, Deutsch]",A Plan No One Escapes,Spectre,6.3,4466,DanielCraig ChristophWaltz LéaSeydoux RalphFie...,0.191174,0.808812
3,49026,250000000.0,"[Action, Crime, Drama, Thriller]",terrorist secret identity superhero,en,Following the death of District Attorney Harve...,112.31295,"[Legendary Pictures, Warner Bros., DC Entertai...",[United States of America],2012-07-16,1084939099,165.0,[English],The Legend Ends,The Dark Knight Rises,7.6,9106,ChristianBale MichaelCaine GaryOldman AnneHath...,0.130951,0.869037
4,49529,260000000.0,"[Action, Adventure, Science Fiction]",based on novel alien escape 3d,en,"John Carter is a war-weary, former military ca...",43.926995,[Walt Disney Pictures],[United States of America],2012-03-07,284139100,132.0,[English],"Lost in our world, found in another.",John Carter,6.1,2124,TaylorKitsch LynnCollins SamanthaMorton Willem...,0.23999,0.75997


Besides the modified dataframe, we also have three vectorized features: overview, cast names, and keywords.  


**Features** (so far ready to use):  
+ **numeric**: budget, revenue, female_pct, male_pct, runtime 
+ **text(vectorized)**: keywords, overview, cast(names)

**Labels**:  
20 genres, i.e., `genre_*`

In [26]:
# labels
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genres = mlb.fit_transform(df_movies.genres)
genres

array([[1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [27]:
# features
features = np.concatenate((overview_vec, cast, keywords,
                           np.asarray(
                               df_movies.loc[:, ['budget', 'revenue', 'female_pct', 'male_pct', 'runtime']])),
                          axis=1)

In [29]:
features

matrix([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         3.74988282e-01, 6.24980469e-01, 1.62000000e+02],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         1.66661111e-01, 8.33305556e-01, 1.69000000e+02],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         1.91173659e-01, 8.08811635e-01, 1.48000000e+02],
        ...,
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 9.99666778e-01, 7.70000000e+01],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 9.99666778e-01, 8.10000000e+01],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         7.49812547e-01, 2.49937516e-01, 8.50000000e+01]])

## Modelling

The result is very interesting.  

When I use only cast names and gender proportion, the prediction tends to give less genres for each movie than the ground truth (please refer to the `cast_vs_genres` notebook); however, using the eight features I mentioned above, the model tends to give more genres than the ground truth.

In [None]:
from sklearn.model_selection import train_test_split

train_data, test_data, train_labels, test_labels = train_test_split(features, genres,
                                                                    test_size=0.2, random_state=42)

To save time, I didn't do cross-validation but only use the default `MultinomialNB()` algorithm.

In [None]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB

clf = OneVsRestClassifier(MultinomialNB())
clf.fit(train_data, train_labels)
predictions = clf.predict(test_data)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [None]:
accuracy = accuracy_score(y_pred=predictions, y_true=np.array(test_labels))
f1 = f1_score(y_pred=predictions, y_true=np.array(test_labels), average='micro')
precision = precision_score(y_pred=predictions, y_true=np.array(test_labels), average='micro')
recall = recall_score(y_pred=predictions, y_true=np.array(test_labels), average='micro')

print("Subset Accuracy: {}".format(accuracy)) # subset accuracy
print("Global F1 score: {}".format(f1))
print("Global Precision: {}".format(precision))
print("Global Recall: {}".format(recall))

In [None]:
print(classification_report(test_labels, predictions, target_names=mlb.classes_))

In [None]:
for i in range(predictions.shape[1]):
    fp = 0
    fn = 0
    tp = 0
    tn = 0
    
    truth_val = np.asarray(test_labels)[:, i]
    pred_val = np.asarray(predictions)[:, i]
    for vt, vp in zip(truth_val, pred_val):
        if vt==1 and vp==0: fn += 1
        elif vt == 0 and vp==1: fp += 1
        elif vt == 0 and vp==0: tn += 1
        elif vt == 1 and vp==1: tp += 1
    
    print('true1-pred0 rate of {}: {}'.format(mlb.classes_[i], fn/predictions.shape[0]))
    print('true0-pred1 rate of {}: {}'.format(mlb.classes_[i], fp/predictions.shape[0]))
    print('true0-pred0 rate of {}: {}'.format(mlb.classes_[i], tn/predictions.shape[0]))
    print('true1-pred1 rate of {}: {}\n'.format(mlb.classes_[i], tp/predictions.shape[0]))

## *get dummies