In [2]:
import pandas as pd

In [3]:
# Load the IMDb movies dataset, using each title's IMDb id as the row label.
movies = pd.read_csv(
    "IMDb movies.csv",
    index_col="imdb_title_id",
)

In [4]:
# Dimensions of the loaded frame as (rows, columns).
movies.shape

(81273, 21)

In [5]:
# Per-column non-null counts and dtypes; columns whose count is below the
# number of rows contain missing values.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81273 entries, tt0000574 to tt9914286
Data columns (total 21 columns):
title                    81273 non-null object
original_title           81273 non-null object
year                     81273 non-null int64
date_published           81273 non-null object
genre                    81273 non-null object
duration                 81273 non-null int64
country                  81234 non-null object
language                 80518 non-null object
director                 81200 non-null object
writer                   79780 non-null object
production_company       76948 non-null object
actors                   81207 non-null object
description              78843 non-null object
avg_vote                 81273 non-null float64
votes                    81273 non-null int64
budget                   22804 non-null object
usa_gross_income         15094 non-null object
worlwide_gross_income    29892 non-null object
metascore                12722 non-null 

### Dealing with null values


* Dropping all null columns/rows
* Replacing the missing values with a specific value, e.g. the mean, median or mode (measures of central tendency)
#####  NB: Study on Normalization 

### Drop all columns with null values: if a column has at least one null value, you drop it.
* First determine how much data you have in the column. If it is at least 60% filled, avoid dropping the column, as this would affect your sample (random or cluster).
* Also factor this in when deleting rows.

### Replacing with a measure of central tendency (MOCT)
#### Take this approach when only a few values are missing, i.e. at least 60% of the column is filled in
* Mean/Median - The column should be numeric, continuous and fall on the real number line, i.e. no complex numbers.
* Mode - The column should be categorical/ordinal/nominal, i.e. String/Text.
* Static Value - Replace the missing values in the column with a static value, e.g. 'other'


In [6]:
# Stack the frame on top of itself so every row appears at least twice, giving
# a known-duplicated frame for the drop_duplicates() experiments that follow.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
double_df = pd.concat([movies, movies])

In [7]:
# The doubled frame should have twice as many rows as movies and the same columns.
double_df.shape

(162546, 21)

In [8]:
# Keep the first occurrence of each repeated row and return the result as a
# new frame; double_df itself is not modified by this call.
double_df_one = double_df.drop_duplicates(keep="first")

In [9]:
# Dimensions of the de-duplicated frame.
double_df_one.shape

(81273, 21)

In [10]:
# drop_duplicates() without inplace returned a new frame, so double_df should
# still have its doubled row count here.
double_df.shape

(162546, 21)

In [11]:
# keep=False discards every row that has a duplicate, including its first
# occurrence; since each row of double_df appears at least twice, no rows
# survive. The result is rebound to the name rather than mutating the frame
# with inplace=True, which keeps the cell free of hidden in-place state.
double_df = double_df.drop_duplicates(keep=False)

In [12]:
# Dimensions after dropping with keep=False; zero rows are expected.
double_df.shape

(0, 21)

In [13]:
# The source frame was never modified by the duplicate experiments.
movies.shape

(81273, 21)

In [14]:
# Repeat of the earlier check on double_df after the keep=False drop.
double_df.shape

(0, 21)

In [15]:
# Repeat of the earlier check that movies is unchanged.
movies.shape

(81273, 21)

In [16]:
# Work on an independent (deep) copy so that dropping rows from it cannot
# affect the original movies frame.
temp_movies = movies.copy(deep=True)

In [17]:
# The copy starts with the same dimensions as movies.
temp_movies.shape

(81273, 21)

In [18]:
# Drop every row that has at least one null value in any column (dropna's
# defaults: axis=0, how="any"). The result is rebound to the name instead of
# mutating the frame with inplace=True.
temp_movies = temp_movies.dropna()

In [19]:
# Rows remaining after removing every row that contained a null.
temp_movies.shape

(6456, 21)

In [20]:
# The original frame keeps all of its rows; only the copy was reduced.
movies.shape

(81273, 21)

### Task:
* Create a temporary dataframe using the copy method
* Find out how to drop the columns which have few values, e.g. metascore
* Replace all null values in relatively filled-in columns with the corresponding measure of central tendency, e.g. replace missing country values with the mode.

#### NB: Make sure all columns with null values are dealt with appropriately in the temporary dataframe copy you created


In [21]:
# Independent (deep) copy of movies that serves as the working frame for the
# null-handling task.
temp_one = movies.copy(deep=True)

In [22]:
# The working copy starts with the same dimensions as movies.
temp_one.shape

(81273, 21)

In [23]:
# Non-null counts per column of the working copy, taken before any fills.
temp_one.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81273 entries, tt0000574 to tt9914286
Data columns (total 21 columns):
title                    81273 non-null object
original_title           81273 non-null object
year                     81273 non-null int64
date_published           81273 non-null object
genre                    81273 non-null object
duration                 81273 non-null int64
country                  81234 non-null object
language                 80518 non-null object
director                 81200 non-null object
writer                   79780 non-null object
production_company       76948 non-null object
actors                   81207 non-null object
description              78843 non-null object
avg_vote                 81273 non-null float64
votes                    81273 non-null int64
budget                   22804 non-null object
usa_gross_income         15094 non-null object
worlwide_gross_income    29892 non-null object
metascore                12722 non-null 

In [24]:
# Pull out the reviews_from_users column of the working copy as a Series.
rvc = temp_one['reviews_from_users']

In [25]:
# Mean of the column; Series.mean() skips missing values by default.
rvc_mean = rvc.mean()

In [26]:
# Fill the missing values with the column mean and write the result back
# through the parent frame. Calling fillna(inplace=True) on the extracted
# Series only reaches temp_one when that Series is a view of the frame, which
# is not guaranteed and never holds under pandas Copy-on-Write.
rvc = rvc.fillna(rvc_mean)
temp_one['reviews_from_users'] = rvc

In [27]:
# Inspects the original `movies` frame, not the `temp_one` copy that was just
# filled. NOTE(review): if the goal is to verify the fill, temp_one.info() is
# the frame to check — confirm intent.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 81273 entries, tt0000574 to tt9914286
Data columns (total 21 columns):
title                    81273 non-null object
original_title           81273 non-null object
year                     81273 non-null int64
date_published           81273 non-null object
genre                    81273 non-null object
duration                 81273 non-null int64
country                  81234 non-null object
language                 80518 non-null object
director                 81200 non-null object
writer                   79780 non-null object
production_company       76948 non-null object
actors                   81207 non-null object
description              78843 non-null object
avg_vote                 81273 non-null float64
votes                    81273 non-null int64
budget                   22804 non-null object
usa_gross_income         15094 non-null object
worlwide_gross_income    29892 non-null object
metascore                12722 non-null 

In [28]:
# Persist temp_one, the working copy that the null handling was applied to,
# rather than movies, which is still the raw, unmodified frame.
temp_one.to_csv("clean_movies.csv")

### Clean up all columns
* Once you have cleaned up your columns, run dataframe.to_csv("clean_movies.csv") to save the result

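### Hint: use
#### series.notna().mean()
This gives the fraction of values in the column that are not null, e.g. 0.6 means the column is 60% filled.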

##### Take this approach when only a few values are missing, i.e. at least 60% of the column is filled in
* Mean/Median - The column should be numeric, continuous and fall on the real number line, i.e. no complex numbers.
* Mode - The column should be categorical/ordinal/nominal, i.e. String/Text.
* Static Value - Replace the missing values in the column with a static value, e.g. 'other'


In [29]:
# Preview the first five rows (head()'s default) of the original frame.
movies.head()

Unnamed: 0_level_0,title,original_title,year,date_published,genre,duration,country,language,director,writer,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
imdb_title_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,Gene Gauntier,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [45]:
# Distinct values of the country column. A multi-country film is stored as one
# comma-joined string (e.g. "Germany, Denmark"), so each combination counts as
# its own value; NaN, if present, is included as well.
ls = movies["country"].unique()

In [46]:
# Number of distinct country values, reported as a one-element tuple.
ls.shape

(4633,)