# Exploratory data analysis

In [None]:
import pandas as pd
import numpy as np

## Import the MovieLens database

In [None]:
# Load the MovieLens CSV files, one DataFrame per file, so their sizes can be checked.
# For a quicker partial load, pass nrows=... to read_csv (the author sampled with nrows=1000000).
data_path = '../raw_data/ml-25m/'
g_scores = pd.read_csv(f'{data_path}genome-scores.csv')
g_tags = pd.read_csv(f'{data_path}genome-tags.csv')
links = pd.read_csv(f'{data_path}links.csv')
movies = pd.read_csv(f'{data_path}movies.csv')
ratings = pd.read_csv(f'{data_path}ratings.csv')
tags = pd.read_csv(f'{data_path}tags.csv')

In [None]:
# Report the (rows, columns) shape of every loaded table.
for table_name, table in (
    ('g_scores', g_scores),
    ('g_tags', g_tags),
    ('links', links),
    ('movies', movies),
    ('ratings', ratings),
    ('tags', tags),
):
    print(f'Formato da base {table_name}:', table.shape)

## Análise exploratória:

### Genome-scores:


In [None]:
# Preview the first four rows of g_scores.
g_scores.head(4)

In [None]:
# Missing values per column, followed by column dtypes and non-null counts.
print(g_scores.isna().sum())
g_scores.info()

### Genome-tags:

In [None]:
# Preview the first four rows of g_tags.
g_tags.head(4)

In [None]:
# Missing values per column, followed by column dtypes and non-null counts.
print(g_tags.isna().sum())
g_tags.info()

### Links:

In [None]:
# Preview the first four rows of links.
links.head(4)

In [None]:
# Missing values per column, followed by column dtypes and non-null counts.
print(links.isna().sum())
links.info()

### Movies:

In [None]:
# Preview the first four rows of movies.
movies.head(4)

In [None]:
# Missing values per column, followed by column dtypes and non-null counts.
print(movies.isna().sum())
movies.info()

### Ratings:

In [None]:
# Preview the first four rows of ratings.
ratings.head(4)

In [None]:
# Missing values per column, followed by column dtypes and non-null counts.
print(ratings.isna().sum())
ratings.info()

### Tags:


In [None]:
# Preview the first rows of tags (head() defaults to five rows).
tags.head()

In [None]:
# Missing values per column, followed by column dtypes and non-null counts.
print(tags.isna().sum())
tags.info()

We can exclude the 16 entries with a missing tag: they are negligible next to the 1093360 tags we have on the movies.

Which movies don't have tags?

### Movies without TMDB ID

In [None]:
# Build missing_movies: a copy of links with an extra boolean column 'empty'.
# NOTE: despite its name, 'empty' is True when a tmdbId is PRESENT and False when it is missing.
missing_movies = links.assign(empty=links.tmdbId.notnull())

# How many links have (True) / lack (False) a tmdbId.
print(missing_movies['empty'].value_counts())

# Keep only the links without a tmdbId and attach the movie details.
missing_movies = missing_movies.loc[~missing_movies['empty']]
missing_movies = missing_movies.merge(movies, on='movieId')
missing_movies.head(5)

We can exclude the 107 movies that don't have tmdbId!

### Movies without tags

In [None]:
# Count the non-null movieId values among the rows of `tags` whose `tag` text is missing.
# NOTE: this counts tag entries, not distinct movies, despite the variable name.
movie_ids_without_tags = tags.loc[tags['tag'].isna(), 'movieId'].count()

In [None]:
# movie_ids_without_tags is computed from the rows of `tags` whose tag text is null,
# so it is a count of tag entries, not of movies that lack tags; the message says so.
print(f'Number of tag entries with a missing tag: {movie_ids_without_tags}')

We can drop these 16 entries from `tags`: they have no tag text, so they carry no information.

### Standardizing movie genres:


In [None]:
# Replace the "|" characters in the genres column with spaces.
# regex=False treats "|" as a literal on every pandas version: the default for
# `regex` changed between releases, and "|" is a regex metacharacter.
movies['genres'] = movies['genres'].str.replace("|", " ", regex=False)

In [None]:
def movie_genres(x):
    """Convert a whitespace-separated genre string into a list of genre names.

    Parameters
    ----------
    x : str
        Genre names separated by whitespace, or the placeholder
        '(no genres listed)'.

    Returns
    -------
    list of str
        One item per genre; an empty list for the placeholder, so the result
        is always a list.
    """
    # The placeholder itself contains spaces, so catch it before splitting.
    if x == '(no genres listed)':
        return []
    return x.split()

In [None]:
# Convert every genre string into a list of genres, then preview the result.
# NOTE: run this cell once per kernel session. On a second run the values are
# already lists, and movie_genres calls .split() on them.
movies['genres'] = movies['genres'].map(movie_genres)
movies.head(3)

In [None]:
# Preview the first four rows of movies.
movies.head(4)

In [None]:
# Reference list of genre names, lower-cased.
test = [
    genre.lower()
    for genre in (
        'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX',
        'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    )
]
test

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Binarize the genre lists: fit_transform returns one indicator column per label in mlb.classes_.
labels = movies['genres']
mlb = MultiLabelBinarizer()
results = mlb.fit_transform(labels)
print(mlb.classes_)

# Column names are the lower-cased class labels.
col = [genre.lower() for genre in mlb.classes_]
print(col)
print(results)

# Attach the indicator columns to movies, aligned on its index.
# NOTE: run once per kernel session; joining the same column names again raises an overlap error.
movies = movies.join(pd.DataFrame(results, columns=col, index=movies.index))

In [None]:
# Preview the first four rows of movies (the genre indicator columns are joined in the cell above).
movies.head(4)

## Movie Lens - Data cleaning

### Merging the movies and links tables and removing the 107 movies without a TMDB ID

In [None]:
# Right join on movieId: every row of movies is kept, with the columns of links placed first.
movies = pd.merge(links, movies, how='right', on='movieId')
print(movies.shape)
movies.head(1)

In [None]:
# Drop the rows of movies that have no tmdbId.
movies = movies.dropna(subset=['tmdbId'])

In [None]:
# Largest value of each id column, printed next to the integer limits for comparison.
print(links['movieId'].max())
print(links['imdbId'].max())
print(links['tmdbId'].max())
print("int16 max: 32.767")
print("int32 max: 2.147.483.647")

In [None]:
# Cast the three id columns to int32 in one astype call, rebinding movies instead of
# assigning through attribute access on the existing frame.
# tmdbId must contain no missing values here: NaN cannot be cast to an integer dtype.
movies = movies.astype({
    'movieId': np.int32,
    'imdbId': np.int32,
    'tmdbId': np.int32,
})

In [None]:
# Preview the first four rows of movies.
movies.head(4)

### Removing NaN entries from tags

In [None]:
# Drop the rows of tags whose tag text is missing, then show the remaining non-null counts.
tags = tags.dropna(subset=['tag'])
tags.info()

## Import Data from API TMDB

*Link para documentação:* \
https://www.themoviedb.org/documentation/api