In [58]:
import pandas as pd
import numpy as np
import re

# 1. Read Data

In [266]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [98]:
train.shape

(3000, 23)

In [99]:
test.shape

(4398, 22)

# 2. Clean Data

There seem to be many more test cases than training cases; ideally we'd want it the other way around. Let's clean up these datasets a bit.

In [100]:
train.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,original_title,overview,popularity,...,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew,revenue
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,tt2637294,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,...,2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de...",12314651
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0368933,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,...,8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de...",95149435
2,3,,3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.29999,...,10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de...",13092000
3,4,,1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,tt1821480,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,...,3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de...",16000000
4,5,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,tt1380152,ko,마린보이,Marine Boy is the story of a former national s...,1.14807,...,2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,,"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de...",3923970


The following columns can be dropped from our two datasets: belongs_to_collection, homepage, imdb_id, original_title, overview, poster_path, status, tagline, Keywords, crew, production_companies, production_countries.

In [265]:
drop_cols = ['belongs_to_collection','homepage','imdb_id','original_title','overview','poster_path','status','tagline','Keywords','crew','production_companies','production_countries']

In [267]:
# Drop all unwanted columns with one call per frame and rebind the result
# instead of mutating in place. Only columns that are still present are
# requested, so re-running this cell after the columns are already gone is a
# no-op rather than a KeyError on the first missing column.
train = train.drop([col for col in drop_cols if col in train.columns], axis=1)
test = test.drop([col for col in drop_cols if col in test.columns], axis=1)

In [268]:
train.shape

(3000, 11)

In [269]:
test.shape

(4398, 10)

Now let's check for null values and clean up the columns individually.

In [272]:
train.isnull().sum()

id                    0
budget                0
genres                7
original_language     0
popularity            0
release_date          0
runtime               2
spoken_languages     20
title                 0
cast                 13
revenue               0
dtype: int64

In [273]:
test.isnull().sum()

id                    0
budget                0
genres               16
original_language     0
popularity            0
release_date          1
runtime               4
spoken_languages     42
title                 3
cast                 13
dtype: int64

## 2.1 genres
The genres column holds, for each film, a string that looks like a list of dictionaries (one `id`/`name` pair per genre). Let's first clean up the column so that each entry is just a list of genre names. Some films are a part of more than one genre, so a list can contain several names.

In [274]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4398 entries, 0 to 4397
Data columns (total 10 columns):
id                   4398 non-null int64
budget               4398 non-null int64
genres               4382 non-null object
original_language    4398 non-null object
popularity           4398 non-null float64
release_date         4397 non-null object
runtime              4394 non-null float64
spoken_languages     4356 non-null object
title                4395 non-null object
cast                 4385 non-null object
dtypes: float64(2), int64(2), object(6)
memory usage: 343.7+ KB


In [275]:
#function that cleans up the genre data; essentially it just extracts the 'name' values from the key-value pairs
def clean_genre(df):
    """Extract the genre names from the raw ``genres`` column of ``df``.

    Each raw entry is expected to be a string such as
    ``"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}]"``.

    Parameters
    ----------
    df : object exposing a ``genres`` attribute (e.g. a DataFrame)
        ``df.genres`` is iterated once, in order.

    Returns
    -------
    list
        One item per entry of ``df.genres``, in the same order: a list of
        genre-name strings, or the placeholder string ``'NULL'`` when the
        entry is missing (``None`` or NaN).
    """
    clean_list = []
    for genre in df.genres:
        # Test missing values by value, not by identity with the np.nan
        # singleton: None and other float NaN objects must also be treated as
        # missing instead of being passed to re.findall (which raises TypeError).
        is_missing = genre is None or (isinstance(genre, float) and np.isnan(genre))
        if is_missing:
            clean_list.append('NULL')
        else:
            clean_list.append(re.findall("name': '(.*?)'}", genre))
    return clean_list

In [276]:
clean_genrelist = clean_genre(train)

In [277]:
train.genres = clean_genrelist

In [278]:
train.genres.head(5)

0                            [Comedy]
1    [Comedy, Drama, Family, Romance]
2                             [Drama]
3                   [Thriller, Drama]
4                  [Action, Thriller]
Name: genres, dtype: object

Now do the same for test dataset

In [280]:
clean_genrelist = clean_genre(test)

In [281]:
test.genres = clean_genrelist

In [282]:
test.genres.head(5)

0    [Adventure, Animation, Family, Fantasy]
1                  [Horror, Science Fiction]
2                          [Comedy, Romance]
3                      [Drama, War, Mystery]
4                     [History, Documentary]
Name: genres, dtype: object

Now that we've cleaned up the columns, let's do something about the **NULL** values. We can drop the train rows with NULL genres since there are only 7 of them out of 3000:

In [289]:
train.genres[469]

['Adventure', 'Drama', 'Action', 'Romance', 'Family']

In [284]:
# Find every row whose genres entry is the 'NULL' placeholder and drop those
# rows in one call. The labels come from the frame's current index rather than
# from range(len(train)), so no already-removed label is ever looked up and the
# cell can be re-run safely (a second run simply finds nothing to drop).
is_null_genre = train.genres.map(lambda genre: genre == 'NULL').astype(bool)
null_genre_rows = train[is_null_genre].index
train.drop(null_genre_rows, axis=0, inplace=True)

In [130]:
train.shape

(2993, 15)

The 7 movies have been dropped from the train dataset; now let's manually fill in the null genres in the test dataset.

In [195]:
# Print the full record of every test row whose genres is still the 'NULL' placeholder.
for row_label in range(len(test)):
    if test.loc[row_label, 'genres'] != 'NULL':
        continue
    print(test.loc[row_label, :])

id                                                                   4616
budget                                                            2000000
genres                                                               NULL
original_language                                                      en
popularity                                                       0.995285
production_companies                                                  NaN
production_countries             [{'iso_3166_1': 'RU', 'name': 'Russia'}]
release_date                                                      1/21/10
runtime                                                                85
spoken_languages                 [{'iso_639_1': 'ru', 'name': 'Pусский'}]
title                                          Nasha Russia: Yaytsa sudby
Keywords                                                              NaN
cast                    [{'cast_id': 7, 'character': '–±—Ä–∏–≥–∞–¥–∏—Ä...
crew                    [{'credit_id':

At Index **73** is the movie 'Dangerously Close', it was an Action Thriller

In [170]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning so the manual
# updates below do not print warning messages. This only hides the warning; it
# does not change what the assignments do.
# Background: https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
pd.options.mode.chained_assignment = None

In [171]:
# Genres for the film at index 73.
mylist = ['Action', 'Thriller']

In [172]:
# Write the list into the single cell with .at instead of the chained
# test.genres[73] = ... form, which assigns through an intermediate Series and
# is not guaranteed to update the DataFrame itself.
test.at[73, 'genres'] = mylist

At Index **793** is the movie 'Table for Five', it was a Drama

In [175]:
#function to fill in NULL genres for test df
def fill_genre(genres, index, df=None):
    """Set the genres of one row to ``genres`` and print the stored value.

    Parameters
    ----------
    genres : iterable of str, or str
        Genre names for the film. A single name passed as a plain string is
        treated as a one-item list rather than being split into characters.
    index : label
        Row label of the film to update; must already exist in ``df.index``.
    df : DataFrame, optional
        Frame to update in place. When omitted, the notebook-level ``test``
        frame is used, which keeps existing ``fill_genre(genres, index)``
        calls working unchanged.

    Raises
    ------
    KeyError
        If ``index`` is not a row label of the frame, so that a mistyped
        index cannot silently create a new entry.
    """
    if df is None:
        df = test
    if index not in df.index:
        raise KeyError(index)
    if isinstance(genres, str):
        genres = [genres]
    # .at writes straight into the frame's cell; the chained form
    # df.genres[index] = ... goes through an intermediate Series and is not
    # guaranteed to update the frame.
    df.at[index, 'genres'] = list(genres)
    print(df.at[index, 'genres'])

In [174]:
# 'Table for Five' is a Drama, as stated in the note for index 793.
fill_genre(['Drama'],793)

['Drama']


At Index **910** is the movie 'Valentino', it was an adventure drama

In [177]:
fill_genre(['Adventure','Drama'],910)

['Adventure', 'Drama']


At Index **1221** is the movie 'Street Knight', it was an Action Adventure

In [178]:
fill_genre(['Action','Adventure'],1221)

['Action', 'Adventure']


At Index **1442** is the movie 'My Son', it was a Drama

In [179]:
fill_genre(['Drama'],1442)

['Drama']


At Index **1615** is the movie 'Nasha Russia', it was a Comedy

In [196]:
fill_genre(['Comedy'],1615)

['Comedy']


At Index **1964** is the movie 'Nasha Russia', it was a Crime Action film.

In [181]:
fill_genre(['Crime','Action'],1964)

['Crime', 'Action']


I cannot find the movie at index **2062**, so it will remain blank.

At Index **2118** is the movie 'Duniyadari', it was a Comedy Drama film.

In [190]:
fill_genre(['Comedy','Drama'],2118)

['Comedy', 'Drama']


At Index **2213** is the movie 'Praying with Lior', it was a Documentary film.

In [191]:
fill_genre(['Documentary'],2213)

['Documentary']


At Index **2251** is the movie 'Teddy Bears' Picnic', it was a Comedy film.

In [192]:
fill_genre(['Comedy'],2251)

['Comedy']


At Index **2519** is the movie 'Glukhar v kino', it was a Comedy film.

In [193]:
fill_genre(['Comedy'],2519)

['Comedy']


At Index **3449** is the movie 'Lucky Lady', it was a Drama Comedy film.


In [198]:
fill_genre(['Drama','Comedy'],3449)

['Drama', 'Comedy']


At Index **3485** is the movie 'Death of a Dynasty', it was a Comedy film.

In [200]:
fill_genre(['Comedy'],3485)

['Comedy']


At index **3564** is the movie 'Fahrenheit 9/11', it was a documentary

In [202]:
fill_genre(['Documentary'],3564)

['Documentary']


At index **3817** is the movie 'Miesten välisiä keskusteluja', it was a Drama

In [204]:
fill_genre(['Drama'],3817)

['Drama']


Check that only one NULL, at index **2062**, remains:

In [205]:
# Print the full record of every test row whose genres is still the 'NULL' placeholder.
for row_label in range(len(test)):
    if test.loc[row_label, 'genres'] != 'NULL':
        continue
    print(test.loc[row_label, :])

id                                                                   5063
budget                                                                  0
genres                                                               NULL
original_language                                                      ro
popularity                                                        0.03856
production_companies    [{'name': 'Media Pro Pictures', 'id': 3244}, {...
production_countries            [{'iso_3166_1': 'RO', 'name': 'Romania'}]
release_date                                                     10/21/05
runtime                                                                95
spoken_languages                  [{'iso_639_1': 'ro', 'name': 'Română'}]
title                                                                  15
Keywords                                                              NaN
cast                    [{'cast_id': 1, 'character': 'Hilde', 'credit_...
crew                    [{'credit_id':

Let's give the movie at index **2062** the mode (most common) genre combination.

In [234]:
#get string version of genres
genre_strings = str(test.genres)

In [242]:
test.genres = genre_strings

In [243]:
mode = test.genres.mode()

In [250]:
mode

0    0       0              [Adventure, Animation, ...
dtype: object

In [251]:
fill_genre(['Adventure', 'Animation'],2062)

['Adventure', 'Animation']


In [252]:
#check that no more NULL genres exist (nothing is printed when none remain)
for row_label in range(len(test)):
    if test.loc[row_label, 'genres'] != 'NULL':
        continue
    print(test.loc[row_label, :])

In [271]:
test.isnull().sum()

id                    0
budget                0
genres               16
original_language     0
popularity            0
release_date          1
runtime               4
spoken_languages     42
title                 3
cast                 13
dtype: int64

In [270]:
train.isnull().sum()

id                    0
budget                0
genres                7
original_language     0
popularity            0
release_date          0
runtime               2
spoken_languages     20
title                 0
cast                 13
revenue               0
dtype: int64