In [1]:
import pandas as pd

# Lift pandas' display caps so DataFrames are rendered without truncation
pd.options.display.max_rows = None      # Show all rows
pd.options.display.max_columns = None   # Show all columns

### Load data

In [2]:
# Read the four raw IMDb CSV exports from the current working directory.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference is presumably what triggers the
# warning emitted by this cell and leaves 'year' with mixed int/str values.
imdb_movies = pd.read_csv('IMDb movies.csv', low_memory=False)
imdb_names = pd.read_csv('IMDb names.csv')
imdb_ratings = pd.read_csv('IMDb ratings.csv')
imdb_ppals = pd.read_csv('IMDb title_principals.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


----
## Movies
----
----

### Drop columns

In [3]:
# Columns left out of the movie feature table.
# Labels are kept verbatim (including 'worlwide_gross_income'): drop() raises
# KeyError for any label that is not an existing column.
dropped_movie_columns = [
    'original_title',
    'date_published',
    'usa_gross_income',
    'worlwide_gross_income',
    'budget',
    'metascore',
]
X = imdb_movies.drop(columns=dropped_movie_columns)

### Deal with categorical columns

In [4]:
# Snapshot (copy) of the object-dtype columns, taken before 'year' is converted
categorical_columns = X.select_dtypes(include=['object'])

# Transform string years into integer: keep the last space-separated token of
# each value, so entries with a text prefix before the year still parse.
# Values that are already integers pass through unchanged.
X['year'] = X['year'].astype(str).str.split(' ').str[-1].astype('int64')

# Preview goes last: a notebook cell only renders its final expression
categorical_columns.head()

In [5]:
# Disabled check: counts the NaN entries of the 'metascore' column.
# NOTE(review): 'metascore' is dropped from X earlier in this notebook and `math`
# is not imported in the cells shown, so this line would fail if re-enabled as-is.
# print(len([x for x in X['metascore'] if isinstance(x, float) and math.isnan(x)]))

----
## Names
----
----
(possibly not used, only useful if actual names are needed)

### Drop columns

In [6]:
# Keep only the person identifier and the display name
name_columns = ['imdb_name_id', 'name']
X_names = imdb_names[name_columns]
X_names.head()

Unnamed: 0,imdb_name_id,name
0,nm0000001,Fred Astaire
1,nm0000002,Lauren Bacall
2,nm0000003,Brigitte Bardot
3,nm0000004,John Belushi
4,nm0000005,Ingmar Bergman


----
## Title_principals
----
----
(possibly not used, only useful if principals and movies need to be related)

In [7]:
# Keep only the link between a title, a person and that person's category
principal_columns = ['imdb_title_id', 'imdb_name_id', 'category']
X_ppals = imdb_ppals[principal_columns]
X_ppals.head()

Unnamed: 0,imdb_title_id,imdb_name_id,category
0,tt0000009,nm0063086,actress
1,tt0000009,nm0183823,actor
2,tt0000009,nm1309758,actor
3,tt0000009,nm0085156,director
4,tt0000574,nm0846887,actress


----
## Ratings
----
----

In [8]:
# Keep the title identifier plus the three aggregate vote statistics
rating_columns = ['imdb_title_id', 'weighted_average_vote', 'mean_vote', 'median_vote']
X_ratings = imdb_ratings[rating_columns]
# X_ratings = imdb_ratings  # all other votes may be useful for "because of you"
X_ratings.head()

Unnamed: 0,imdb_title_id,weighted_average_vote,mean_vote,median_vote
0,tt0000009,5.9,5.9,6.0
1,tt0000574,6.1,6.3,6.0
2,tt0001892,5.8,6.0,6.0
3,tt0002101,5.2,5.3,5.0
4,tt0002130,7.0,6.9,7.0


----
## Merge movies and ratings
----
----

In [9]:
# Attach the rating statistics to each movie row.
# Inner join: only titles present in both frames are kept.
X = pd.merge(X, X_ratings, how='inner', on='imdb_title_id')
X.head()

Unnamed: 0,imdb_title_id,title,year,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,reviews_from_users,reviews_from_critics,weighted_average_vote,mean_vote,median_vote
0,tt0000009,Miss Jerry,1894,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,1.0,2.0,5.9,5.9,6.0
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,7.0,7.0,6.1,6.3,6.0
2,tt0001892,Den sorte drøm,1911,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,5.0,2.0,5.8,6.0,6.0
3,tt0002101,Cleopatra,1912,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,25.0,3.0,5.2,5.3,5.0
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,31.0,14.0,7.0,6.9,7.0


### Recheck correlations

In [10]:
# 1 means same signal; 0 means no correlation at all
X_without_votes = X.drop(['mean_vote', 'median_vote', 'avg_vote'], axis=1)

# Pearson correlation is only defined for numeric data. Select the numeric
# (and boolean) columns explicitly: pandas >= 2.0 no longer drops text columns
# silently inside corr() and raises on them instead.
numeric_features = X_without_votes.select_dtypes(include=['number', 'bool'])
corr_table = numeric_features.corr(method='pearson').abs()

# Get the highest values != 1 (filters out the diagonal, where each column meets itself)
corr_coef = corr_table.unstack()
sorted_coef = corr_coef[corr_coef < 1].sort_values(kind='quicksort', ascending=False)
print(sorted_coef)

votes                  reviews_from_users       0.766237
reviews_from_users     votes                    0.766237
reviews_from_critics   votes                    0.671635
votes                  reviews_from_critics     0.671635
reviews_from_critics   reviews_from_users       0.671634
reviews_from_users     reviews_from_critics     0.671634
weighted_average_vote  duration                 0.242432
duration               weighted_average_vote    0.242432
weighted_average_vote  year                     0.217559
year                   weighted_average_vote    0.217559
weighted_average_vote  reviews_from_critics     0.200526
reviews_from_critics   weighted_average_vote    0.200526
year                   duration                 0.172810
duration               year                     0.172810
weighted_average_vote  votes                    0.166972
votes                  weighted_average_vote    0.166972
weighted_average_vote  reviews_from_users       0.138185
reviews_from_users     weighted

### Train/validation split (optional, not yet enabled)

In [None]:
# Imported for the train/validation split below, which is currently disabled
from sklearn.model_selection import train_test_split

# Divide data into training and validation subsets
# NOTE(review): the target `y` is not defined in the cells shown; it has to be
# created before the split below can be enabled.
# X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=45)

# X_train.head()