# Imports

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer

import warnings
warnings.filterwarnings("ignore")

from acquire import acquire_data

import prepare

# Acquire
- acquiring data from acquire.py script

In [2]:
train, test = acquire_data()

# Exploration / Prep

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3000 non-null   int64  
 1   belongs_to_collection  604 non-null    object 
 2   budget                 3000 non-null   int64  
 3   genres                 2993 non-null   object 
 4   homepage               946 non-null    object 
 5   imdb_id                3000 non-null   object 
 6   original_language      3000 non-null   object 
 7   original_title         3000 non-null   object 
 8   overview               2992 non-null   object 
 9   popularity             3000 non-null   float64
 10  poster_path            2999 non-null   object 
 11  production_companies   2844 non-null   object 
 12  production_countries   2945 non-null   object 
 13  release_date           3000 non-null   object 
 14  runtime                2998 non-null   float64
 15  spok

### what percentage of the data is missing?
- train data

In [4]:
prepare.percent_of_values_missing(train)

id                        0.00
belongs_to_collection    79.87
budget                    0.00
genres                    0.23
homepage                 68.47
imdb_id                   0.00
original_language         0.00
original_title            0.00
overview                  0.27
popularity                0.00
poster_path               0.03
production_companies      5.20
production_countries      1.83
release_date              0.00
runtime                   0.07
spoken_languages          0.67
status                    0.00
tagline                  19.90
title                     0.00
Keywords                  9.20
cast                      0.43
crew                      0.53
revenue                   0.00
dtype: float64

- test data

In [5]:
prepare.percent_of_values_missing(test)

id                        0.00
belongs_to_collection    80.06
budget                    0.00
genres                    0.36
homepage                 67.71
imdb_id                   0.00
original_language         0.00
original_title            0.00
overview                  0.32
popularity                0.00
poster_path               0.02
production_companies      5.87
production_countries      2.32
release_date              0.02
runtime                   0.09
spoken_languages          0.95
status                    0.05
tagline                  19.62
title                     0.07
Keywords                  8.94
cast                      0.30
crew                      0.50
dtype: float64

#### I need to learn more about this data
- I can see a few things I should do with this data, it's not as clean as i had originally though.

To Do's:
1. change id column to string (no math needs to be done to that column)
1. drop 'belongs to collection' column 80% null values
1. drop 'homepage' column, missing 68% of values
1. fill missing tagline with empty string
1. drop all other missing data

In [6]:
train.shape, test.shape

((3000, 23), (4398, 22))

## what is belongs to collection
- should i drop it?

In [7]:
# lets see what this column looks like
for i, e in enumerate(train['belongs_to_collection'][:5]):
    print(i, e)

0 [{'id': 313576, 'name': 'Hot Tub Time Machine Collection', 'poster_path': '/iEhb00TGPucF0b4joM1ieyY026U.jpg', 'backdrop_path': '/noeTVcgpBiD48fDjFVic1Vz7ope.jpg'}]
1 [{'id': 107674, 'name': 'The Princess Diaries Collection', 'poster_path': '/wt5AMbxPTS4Kfjx7Fgm149qPfZl.jpg', 'backdrop_path': '/zSEtYD77pKRJlUPx34BJgUG9v1c.jpg'}]
2 nan
3 nan
4 nan


- nested lists filled with movie id, movie title, the poster path, and the backdrop path
- it looks like the original dataframe used to grab the data from
- we don't need this, all of this data is **redundant**

- DROP!

In [8]:
def prep_belong(df):
    df.drop(columns='belongs_to_collection', inplace=True)
    return df

In [9]:
train = prep_belong(train)
test = prep_belong(test)

- done

## genres

In [10]:
len(train.genres)

3000

In [11]:
train.genres.isna().sum()

7

- there are 7 out of the 3,000 rows that are nan's

In [12]:
# what does this data look like?
for i, e in enumerate(train['genres'][:5]):
    print(i, e)

0 [{'id': 35, 'name': 'Comedy'}]
1 [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 10749, 'name': 'Romance'}]
2 [{'id': 18, 'name': 'Drama'}]
3 [{'id': 53, 'name': 'Thriller'}, {'id': 18, 'name': 'Drama'}]
4 [{'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}]


- another nested list
- we have movie id, and the genre, sometimes multiple libraies, probably because a movie is more than 1 genre
- don't 100% understand why these id's for the movie's genres are so oddly numbered tho...

I can either drop or impute the missing 7, or simply use the median 

#### most popular genres

In [13]:
train.genres.value_counts().head()

[{'id': 18, 'name': 'Drama'}]                                       266
[{'id': 35, 'name': 'Comedy'}]                                      186
[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]     108
[{'id': 35, 'name': 'Comedy'}, {'id': 10749, 'name': 'Romance'}]     88
[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}]          85
Name: genres, dtype: int64

- i'll just impute the 7 missing value's with the most common genre
- using sklearns simple imputer

In [14]:
# initalize the imputer w/ the strategy
imputer = SimpleImputer(strategy = 'most_frequent')
# fit the imputer
imputer = imputer.fit(train[['genres']])
# IMPUTE the genres columns of the train and test data
train[['genres']] = imputer.transform(train[['genres']])
test[['genres']] = imputer.transform(test[['genres']])

## production_companies

In [15]:
train.production_companies

0       [{'name': 'Paramount Pictures', 'id': 4}, {'na...
1             [{'name': 'Walt Disney Pictures', 'id': 2}]
2       [{'name': 'Bold Films', 'id': 2266}, {'name': ...
3                                                     NaN
4                                                     NaN
                              ...                        
2995    [{'name': 'Warner Bros.', 'id': 6194}, {'name'...
2996    [{'name': 'Memfis Film', 'id': 321}, {'name': ...
2997    [{'name': 'New Line Cinema', 'id': 12}, {'name...
2998    [{'name': 'Jersey Films', 'id': 216}, {'name':...
2999    [{'name': 'Lions Gate Films', 'id': 35}, {'nam...
Name: production_companies, Length: 3000, dtype: object

In [16]:
for i, e in enumerate(train['production_companies'][:5]):
    print(i, e)

0 [{'name': 'Paramount Pictures', 'id': 4}, {'name': 'United Artists', 'id': 60}, {'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8411}]
1 [{'name': 'Walt Disney Pictures', 'id': 2}]
2 [{'name': 'Bold Films', 'id': 2266}, {'name': 'Blumhouse Productions', 'id': 3172}, {'name': 'Right of Way Films', 'id': 32157}]
3 nan
4 nan


In [17]:
train.production_companies.isna().sum()

156

- looks like there are 156 null's out of 3k in this list of production companies

In [25]:
train['production_companies'].value_counts().head()

[{'name': 'Paramount Pictures', 'id': 4}]                          51
[{'name': 'Universal Pictures', 'id': 33}]                         36
[{'name': 'Warner Bros.', 'id': 6194}]                             32
[{'name': 'Metro-Goldwyn-Mayer (MGM)', 'id': 8411}]                32
[{'name': 'Twentieth Century Fox Film Corporation', 'id': 306}]    24
Name: production_companies, dtype: int64

- If i impute with the most common company I will ruin the dataset.

## production_countries

## tagline

## Keywords

## cast

## crew

In [19]:
train.genres[0]

"[{'id': 35, 'name': 'Comedy'}]"

In [20]:
train.Keywords[0]                                                                                                                                         

"[{'id': 4379, 'name': 'time travel'}, {'id': 9663, 'name': 'sequel'}, {'id': 11830, 'name': 'hot tub'}, {'id': 179431, 'name': 'duringcreditsstinger'}]"

In [21]:
train.cast[0]

"[{'cast_id': 4, 'character': 'Lou', 'credit_id': '52fe4ee7c3a36847f82afae7', 'gender': 2, 'id': 52997, 'name': 'Rob Corddry', 'order': 0, 'profile_path': '/k2zJL0V1nEZuFT08xUdOd3ucfXz.jpg'}, {'cast_id': 5, 'character': 'Nick', 'credit_id': '52fe4ee7c3a36847f82afaeb', 'gender': 2, 'id': 64342, 'name': 'Craig Robinson', 'order': 1, 'profile_path': '/tVaRMkJXOEVhYxtnnFuhqW0Rjzz.jpg'}, {'cast_id': 6, 'character': 'Jacob', 'credit_id': '52fe4ee7c3a36847f82afaef', 'gender': 2, 'id': 54729, 'name': 'Clark Duke', 'order': 2, 'profile_path': '/oNzK0umwm5Wn0wyEbOy6TVJCSBn.jpg'}, {'cast_id': 7, 'character': 'Adam Jr.', 'credit_id': '52fe4ee7c3a36847f82afaf3', 'gender': 2, 'id': 36801, 'name': 'Adam Scott', 'order': 3, 'profile_path': '/5gb65xz8bzd42yjMAl4zwo4cvKw.jpg'}, {'cast_id': 8, 'character': 'Hot Tub Repairman', 'credit_id': '52fe4ee7c3a36847f82afaf7', 'gender': 2, 'id': 54812, 'name': 'Chevy Chase', 'order': 4, 'profile_path': '/svjpyYtPwtjvRxX9IZnOmOkhDOt.jpg'}, {'cast_id': 9, 'character

In [22]:
train.crew[0]

'[{\'credit_id\': \'59ac067c92514107af02c8c8\', \'department\': \'Directing\', \'gender\': 0, \'id\': 1449071, \'job\': \'First Assistant Director\', \'name\': \'Kelly Cantley\', \'profile_path\': None}, {\'credit_id\': \'52fe4ee7c3a36847f82afad7\', \'department\': \'Directing\', \'gender\': 2, \'id\': 3227, \'job\': \'Director\', \'name\': \'Steve Pink\', \'profile_path\': \'/myHOgo8mQSCiCAZNGMRdHVr03jr.jpg\'}, {\'credit_id\': \'5524ed25c3a3687ded000d88\', \'department\': \'Writing\', \'gender\': 2, \'id\': 347335, \'job\': \'Writer\', \'name\': \'Josh Heald\', \'profile_path\': \'/pwXJIenrDMrG7t3zNfLvr8w1RGU.jpg\'}, {\'credit_id\': \'5524ed2d925141720c001128\', \'department\': \'Writing\', \'gender\': 2, \'id\': 347335, \'job\': \'Characters\', \'name\': \'Josh Heald\', \'profile_path\': \'/pwXJIenrDMrG7t3zNfLvr8w1RGU.jpg\'}, {\'credit_id\': \'5524ed3d92514166c1004a5d\', \'department\': \'Production\', \'gender\': 2, \'id\': 57822, \'job\': \'Producer\', \'name\': \'Andrew Panay\', \

In [23]:
df = prepare.prepare_data(train)

KeyError: "['belongs_to_collection'] not found in axis"

In [None]:
prepare.percent_of_values_missing(df)

In [None]:
df.shape, test.shape

In [None]:
train.overview[0]

In [None]:
train.production_companies[0]

In [None]:
train.production_countries[0]