In [1]:
import numpy as np
import pandas as pd
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns 


In [2]:
# Load the IMDb movie metadata table from the local data directory.
data_movies = pd.read_csv(filepath_or_buffer='./IMDb_data/IMDb_movies.csv')

In [3]:
# Load the IMDb ratings table from the local data directory.
data_ratings = pd.read_csv(filepath_or_buffer='./IMDb_data/IMDb_ratings.csv')

In [5]:
# Preview the first five rows of the movie table.
data_movies.head(n=5)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [6]:
# Inspect the dtype pandas inferred for every column of the movie table.
data_movies.dtypes

imdb_title_id             object
title                     object
original_title            object
year                       int64
date_published            object
genre                     object
duration                   int64
country                   object
language                  object
director                  object
writer                    object
production_company        object
actors                    object
description               object
avg_vote                 float64
votes                      int64
budget                    object
usa_gross_income          object
worlwide_gross_income     object
metascore                float64
reviews_from_users       float64
reviews_from_critics     float64
dtype: object

In [7]:
# Percentage of missing values in every column, rounded to two decimals.
# Used to spot the columns in which more than 50% of the values are missing.
data_movies.isna().mean().mul(100).round(2)

imdb_title_id             0.00
title                     0.00
original_title            0.00
year                      0.00
date_published            0.00
genre                     0.00
duration                  0.00
country                   0.05
language                  0.93
director                  0.09
writer                    1.84
production_company        5.32
actors                    0.08
description               2.99
avg_vote                  0.00
votes                     0.00
budget                   71.94
usa_gross_income         81.43
worlwide_gross_income    63.22
metascore                84.35
reviews_from_users        8.71
reviews_from_critics     13.52
dtype: float64

In [8]:
# Show the rows that have no recorded USA gross income.
# Observation: most of these missing values belong to foreign movies.
data_movies.loc[data_movies['usa_gross_income'].isna()]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,tt9903716,Jessie,Jessie,2019,2019-03-15,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,"Sritha Chandana, Pavani Gangireddy, Abhinav Go...","Set in an abandoned house, the film follows a ...",7.2,219,,,,,21.0,
81269,tt9905412,Ottam,Ottam,2019,2019-03-08,Drama,120,India,Malayalam,Zam,...,"Nandu Anand, Roshan Ullas, Manikandan R. Achar...","Set in Trivandrum, the story of Ottam unfolds ...",7.8,510,INR 4000000,,$ 4791,,,
81270,tt9905462,Pengalila,Pengalila,2019,2019-03-08,Drama,111,India,Malayalam,T.V. Chandran,...,"Lal, Akshara Kishor, Iniya, Narain, Renji Pani...",An unusual bond between a sixty year old Dalit...,8.4,604,INR 10000000,,,,,
81271,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",,8.4,369,,,,,,


In [13]:
# Show the rows that have no metascore.
data_movies.loc[data_movies['metascore'].isna()]

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,tt9903716,Jessie,Jessie,2019,2019-03-15,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,"Sritha Chandana, Pavani Gangireddy, Abhinav Go...","Set in an abandoned house, the film follows a ...",7.2,219,,,,,21.0,
81269,tt9905412,Ottam,Ottam,2019,2019-03-08,Drama,120,India,Malayalam,Zam,...,"Nandu Anand, Roshan Ullas, Manikandan R. Achar...","Set in Trivandrum, the story of Ottam unfolds ...",7.8,510,INR 4000000,,$ 4791,,,
81270,tt9905462,Pengalila,Pengalila,2019,2019-03-08,Drama,111,India,Malayalam,T.V. Chandran,...,"Lal, Akshara Kishor, Iniya, Narain, Renji Pani...",An unusual bond between a sixty year old Dalit...,8.4,604,INR 10000000,,,,,
81271,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",,8.4,369,,,,,,


In [None]:
# Discard the metascore column (about 84% of its values are missing).
data_movies = data_movies.drop(columns=['metascore'])

In [None]:
# date_published holds mixed formats: full ISO dates as well as year-only
# strings such as "1913". Strict parsing can raise on such a column, so any
# value that cannot be parsed is coerced to NaT instead of aborting the whole
# conversion. Check the number of NaT values afterwards if exact dates matter.
data_movies['date_published'] = pd.to_datetime(
    data_movies['date_published'],
    errors='coerce',
)

In [None]:
# Treat a missing user-review count as zero so the column can be stored as
# integers, then display the dtypes to confirm the conversion.
data_movies['reviews_from_users'] = (
    data_movies['reviews_from_users']
    .fillna(0)
    .astype(int)
)
data_movies.dtypes


In [None]:
# Treat a missing critic-review count as zero and store the column as integers.
data_movies['reviews_from_critics'] = (
    data_movies['reviews_from_critics']
    .fillna(0)
    .astype(int)
)


In [None]:
# Replace missing USA gross income values with 0.
# Note: this column is dropped again a few cells further down.
data_movies = data_movies.assign(
    usa_gross_income=data_movies['usa_gross_income'].fillna(0)
)
 

In [None]:
# Re-inspect the column dtypes of the movie table.
data_movies.dtypes

In [None]:
# Remove the monetary columns; each is missing in more than 60% of the rows
# according to the missing-value summary shown earlier.
dropped_cols_movies = ['worlwide_gross_income', 'budget', 'usa_gross_income']
data_movies = data_movies.drop(columns=dropped_cols_movies)

In [None]:
# Count the non-null entries of every column per release year; `year` is
# turned back into a regular column so it can be used as a plot axis.
data_movies_test = (
    data_movies
    .groupby(by=['year'])
    .count()
    .reset_index()
)

In [None]:
# Display the per-year counts table.
data_movies_test

In [None]:
# Categorical plot of the number of titles per year on a large square
# canvas (height 20, aspect ratio 1). The trailing semicolon hides the repr.
sns.catplot(
    data=data_movies_test,
    x='year',
    y='title',
    height=20,
    aspect=1.0,
);

In [None]:
# Scatter plot of the number of titles per year, drawn on an explicitly
# created 12x8 axes. The trailing semicolon hides the repr.
fig, ax = plt.subplots(figsize=(12, 8))

sns.scatterplot(
    data=data_movies_test,
    x='year',
    y='title',
    color='pink',
    s=100,
    ax=ax,
);

In [None]:
# Display the ratings table.
data_ratings

In [None]:
# Inspect the dtype pandas inferred for every column of the ratings table.
data_ratings.dtypes


In [None]:
# Percentage of missing values in every column of the ratings table.
data_ratings.isna().mean() * 100

In [None]:
# Remove the "0age" vote columns (all genders, males and females) from the
# ratings table.
drop_ratings_cols = [
    'allgenders_0age_avg_vote',
    'allgenders_0age_votes',
    'males_0age_avg_vote',
    'males_0age_votes',
    'females_0age_avg_vote',
    'females_0age_votes',
]
data_ratings = data_ratings.drop(columns=drop_ratings_cols)

In [None]:
# Display the ratings table again.
data_ratings

In [None]:
# Replace every missing value in the ratings table with 0.
data_ratings = data_ratings.fillna(value=0)

In [None]:
# Vote-count columns that should be stored as integers.
int_ratings = [
    'allgenders_18age_votes',
    'allgenders_30age_votes',
    'allgenders_45age_votes',
    'males_allages_votes',
    'males_18age_votes',
    'males_30age_votes',
    'males_45age_votes',
    'females_allages_votes',
    'females_18age_votes',
    'females_30age_votes',
    'females_45age_votes',
    'top1000_voters_votes',
    'us_voters_votes',
    'non_us_voters_votes',
]

# Cast all of them in a single astype call using a column -> dtype mapping.
data_ratings = data_ratings.astype({column: int for column in int_ratings})

In [None]:
# Re-inspect the column dtypes of the ratings table.
data_ratings.dtypes

In [None]:
# Optional column pruning -- currently disabled (left commented out).
#data_movies_drop = ['original_title','actors', 'description']
#data_movies = data_movies.drop(data_movies_drop, axis=1)

In [None]:
# Preview the first three rows of the ratings table.
data_ratings.head(n=3)

In [None]:
# Write both cleaned tables to CSV files without the index column.
# DataFrame.to_csv returns None when it is given a path, so the result must
# not be assigned back: doing so would replace both DataFrames with None and
# break any later cell (or a re-run of this one) that still needs them.
data_movies.to_csv('data_movies_clean.csv', index=False)
data_ratings.to_csv('data_ratings_clean.csv', index=False)