In [1]:
import pandas as pd
import requests

In [2]:
movie_data = pd.read_csv('MovieGenre.csv', encoding='latin-1')

In [3]:
movie_data.head()

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...


##### Now I want to see the unique Genres in the dataframe

In [4]:
movie_data['Genre'].unique()

array(['Animation|Adventure|Comedy', 'Action|Adventure|Family',
       'Comedy|Romance', ..., 'Horror|Comedy|Sci-Fi',
       'Drama|History|Sci-Fi', 'Documentary|Comedy|Family'], dtype=object)

##### I would like to categorize for the first ten genres. I could unnecessarily randomize this. I just choose not to.

In [5]:
unwanted_genre = movie_data['Genre'].unique()[10:]
unwanted_genre

array(['Comedy|Fantasy|Horror', 'Animation|Adventure|Drama',
       'Biography|Drama|History', ..., 'Horror|Comedy|Sci-Fi',
       'Drama|History|Sci-Fi', 'Documentary|Comedy|Family'], dtype=object)

In [6]:
len(unwanted_genre)

1298

##### Here I am going to keep the movies that are not in the selected genres. 

In [7]:
new_data = movie_data[~movie_data['Genre'].isin(unwanted_genre)]

In [8]:
new_data

Unnamed: 0,imdbId,Imdb Link,Title,IMDB Score,Genre,Poster
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),8.3,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),6.9,Action|Adventure|Family,https://images-na.ssl-images-amazon.com/images...
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),6.6,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),5.7,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),5.9,Comedy|Family|Romance,https://images-na.ssl-images-amazon.com/images...
...,...,...,...,...,...,...
40072,4550420,http://www.imdb.com/title/tt4550420,My Blind Brother (2016),6.0,Comedy|Drama|Romance,https://images-na.ssl-images-amazon.com/images...
40075,1080012,http://www.imdb.com/title/tt1080012,Faintheart (2008),6.4,Comedy|Romance,https://images-na.ssl-images-amazon.com/images...
40079,1135972,http://www.imdb.com/title/tt1135972,Le Grand Chef (2007),6.7,Comedy|Drama,https://images-na.ssl-images-amazon.com/images...
40081,1679335,http://www.imdb.com/title/tt1679335,Trolls (2016),6.5,Animation|Adventure|Comedy,https://images-na.ssl-images-amazon.com/images...


###### Yes, this will surely give me ten different genres.

In [9]:
new_data['Genre'].unique()

array(['Animation|Adventure|Comedy', 'Action|Adventure|Family',
       'Comedy|Romance', 'Comedy|Drama|Romance', 'Comedy|Family|Romance',
       'Action|Crime|Drama', 'Comedy|Drama', 'Adventure|Comedy|Drama',
       'Action|Crime|Thriller', 'Action|Adventure|Thriller'], dtype=object)

##### I am creating another table just based on different genres. They will be automatically in serial with the original dataframe.

In [10]:
genre = pd.get_dummies(new_data['Genre'])

In [11]:
genre

Unnamed: 0,Action|Adventure|Family,Action|Adventure|Thriller,Action|Crime|Drama,Action|Crime|Thriller,Adventure|Comedy|Drama,Animation|Adventure|Comedy,Comedy|Drama,Comedy|Drama|Romance,Comedy|Family|Romance,Comedy|Romance
0,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
40072,0,0,0,0,0,0,0,1,0,0
40075,0,0,0,0,0,0,0,0,0,1
40079,0,0,0,0,0,0,1,0,0,0
40081,0,0,0,0,0,1,0,0,0,0


##### I really don't think you'd need the other columns except title. So I am dropping it here.

In [12]:
new_data.drop(new_data.columns.difference(['Title']), 1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [13]:
new_data

Unnamed: 0,Title
0,Toy Story (1995)
1,Jumanji (1995)
2,Grumpier Old Men (1995)
3,Waiting to Exhale (1995)
4,Father of the Bride Part II (1995)
...,...
40072,My Blind Brother (2016)
40075,Faintheart (2008)
40079,Le Grand Chef (2007)
40081,Trolls (2016)


##### I am adding(concatenating) the genre table here with new_data. You can match this with the original data.

In [14]:
new_data = pd.concat([new_data, genre], axis=1)

In [15]:
new_data.head()

Unnamed: 0,Title,Action|Adventure|Family,Action|Adventure|Thriller,Action|Crime|Drama,Action|Crime|Thriller,Adventure|Comedy|Drama,Animation|Adventure|Comedy,Comedy|Drama,Comedy|Drama|Romance,Comedy|Family|Romance,Comedy|Romance
0,Toy Story (1995),0,0,0,0,0,1,0,0,0,0
1,Jumanji (1995),1,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men (1995),0,0,0,0,0,0,0,0,0,1
3,Waiting to Exhale (1995),0,0,0,0,0,0,0,1,0,0
4,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,1,0


# Thank You