# I. Data Cleaning

1. [Import libraries and scr.py file](#import-libraries)
2. [Import & clean documents](#import-clean-documents)
    - 2.1. [actor.csv](#import-actors)
    - 2.2. [category.csv](#import-category)
    - 2.3. [film.csv](#import-film)
    - 2.4. [inventory.csv](#import-inventory)
    - 2.5. [language.csv](#import-language)
    - 2.6. [old_HDD.csv](#import-old-hdd)
    - 2.7. [rental.csv](#import-rental)
3. [Export documents](#export-documents)


## 1. Import libraries and scr.py file

In [348]:
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

In [349]:
import scr as sc #Functions

In [350]:
import pylab as plt  
import seaborn as sns

%matplotlib inline

## 2. Import & clean documents

### 2.1 actor.csv

In [351]:
actors = pd.read_csv('../data/0-raw/actor.csv')
actors.head()

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1,PENELOPE,GUINESS,2006-02-15 04:34:33
1,2,NICK,WAHLBERG,2006-02-15 04:34:33
2,3,ED,CHASE,2006-02-15 04:34:33
3,4,JENNIFER,DAVIS,2006-02-15 04:34:33
4,5,JOHNNY,LOLLOBRIGIDA,2006-02-15 04:34:33


In [352]:
actors.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   actor_id     200 non-null    int64 
 1   first_name   200 non-null    object
 2   last_name    200 non-null    object
 3   last_update  200 non-null    object
dtypes: int64(1), object(3)
memory usage: 41.0 KB


In [353]:
actors.shape

(200, 4)

In [354]:
sc.check_nan(actors)

'N nan cols: 0'

Series([], dtype: float64)

In [355]:
#Column cleaning function

actors= sc.column_unification(actors)

In [356]:
actors[actors.duplicated(["first_name","last_name"])]

Unnamed: 0,actor_id,first_name,last_name,last_update
109,110,SUSAN,DAVIS,2006-02-15 04:34:33


In [357]:
# Remove repeated (first_name, last_name) pairs, keeping the first occurrence.
# Selecting by key instead of a hard-coded row label keeps this cell correct
# if the raw file is re-exported or its row order changes.
actors.drop_duplicates(subset=['first_name', 'last_name'], keep='first', inplace=True)

In [358]:
actors[actors.duplicated(['first_name','last_name'])]

Unnamed: 0,actor_id,first_name,last_name,last_update


##### "first_name" & "last_name"

In [359]:
# Give both name columns the same casing: first letter upper, the rest lower.
for name_col in ('first_name', 'last_name'):
    actors[name_col] = actors[name_col].str.capitalize()

actors.head()

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1,Penelope,Guiness,2006-02-15 04:34:33
1,2,Nick,Wahlberg,2006-02-15 04:34:33
2,3,Ed,Chase,2006-02-15 04:34:33
3,4,Jennifer,Davis,2006-02-15 04:34:33
4,5,Johnny,Lollobrigida,2006-02-15 04:34:33


##### "last_update"

###### Drop  column (constant)

In [360]:
# 'last_update' is treated as a constant column in this table, so it is removed.
actors = actors.drop(columns=['last_update'])

### 2.2 category.csv

In [361]:
category = pd.read_csv('../data/0-raw/category.csv')
category.head(2)

Unnamed: 0,category_id,name,last_update
0,1,Action,2006-02-15 04:46:27
1,2,Animation,2006-02-15 04:46:27


In [362]:
category.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16 entries, 0 to 15
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   category_id  16 non-null     int64 
 1   name         16 non-null     object
 2   last_update  16 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.4 KB


In [363]:
category.shape

(16, 3)

In [364]:
sc.check_nan(category)

'N nan cols: 0'

Series([], dtype: float64)

In [365]:
#Column cleaning function

category= sc.column_unification(category)

In [366]:
category.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
category_id,16.0,,,,8.5,4.760952,1.0,4.75,8.5,12.25,16.0
name,16.0,16.0,Action,1.0,,,,,,,
last_update,16.0,1.0,2006-02-15 04:46:27,16.0,,,,,,,


##### "name"

In [367]:
category.name.value_counts()

name
Action         1
Animation      1
Children       1
Classics       1
Comedy         1
Documentary    1
Drama          1
Family         1
Foreign        1
Games          1
Horror         1
Music          1
New            1
Sci-Fi         1
Sports         1
Travel         1
Name: count, dtype: int64

##### "last_update"

###### Drop  column (constant)

In [368]:
# describe() above reports a single unique 'last_update' value, so the column
# adds nothing and is removed.
category = category.drop(columns=['last_update'])

### 2.3 film.csv

In [369]:
film = pd.read_csv('../data/0-raw/film.csv')
film.head(2)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42


In [370]:
# memory_usage takes a bool or 'deep'. True requests the shallow estimate,
# which pandas reports with a trailing "+" when object columns are present.
film.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   film_id               1000 non-null   int64  
 1   title                 1000 non-null   object 
 2   description           1000 non-null   object 
 3   release_year          1000 non-null   int64  
 4   language_id           1000 non-null   int64  
 5   original_language_id  0 non-null      float64
 6   rental_duration       1000 non-null   int64  
 7   rental_rate           1000 non-null   float64
 8   length                1000 non-null   int64  
 9   replacement_cost      1000 non-null   float64
 10  rating                1000 non-null   object 
 11  special_features      1000 non-null   object 
 12  last_update           1000 non-null   object 
dtypes: float64(3), int64(5), object(5)
memory usage: 101.7+ KB


In [371]:
sc.check_nan(film)

'N nan cols: 1'

original_language_id    100.0
dtype: float64

In [372]:
#Column cleaning function

film= sc.column_unification(film)

In [373]:
film.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
film_id,1000.0,,,,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
title,1000.0,1000.0,ACADEMY DINOSAUR,1.0,,,,,,,
description,1000.0,1000.0,A Epic Drama of a Feminist And a Mad Scientist...,1.0,,,,,,,
release_year,1000.0,,,,2006.0,0.0,2006.0,2006.0,2006.0,2006.0,2006.0
language_id,1000.0,,,,1.0,0.0,1.0,1.0,1.0,1.0,1.0
original_language_id,0.0,,,,,,,,,,
rental_duration,1000.0,,,,4.985,1.411654,3.0,4.0,5.0,6.0,7.0
rental_rate,1000.0,,,,2.98,1.646393,0.99,0.99,2.99,4.99,4.99
length,1000.0,,,,115.272,40.426332,46.0,80.0,114.0,149.25,185.0
replacement_cost,1000.0,,,,19.984,6.050833,9.99,14.99,19.99,24.99,29.99


In [374]:
film.shape

(1000, 13)

In [375]:
film[film.duplicated(['title'])]

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update


#### "title" & "description"

In [376]:
# Store both free-text columns in upper case.
for text_col in ('title', 'description'):
    film[text_col] = film[text_col].str.upper()

film.head()

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A EPIC DRAMA OF A FEMINIST AND A MAD SCIENTIST...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A ASTOUNDING EPISTLE OF A DATABASE ADMINISTRAT...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A ASTOUNDING REFLECTION OF A LUMBERJACK AND A ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42
3,4,AFFAIR PREJUDICE,A FANCIFUL DOCUMENTARY OF A FRISBEE AND A LUMB...,2006,1,,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42
4,5,AFRICAN EGG,A FAST-PACED DOCUMENTARY OF A PASTRY CHEF AND ...,2006,1,,6,2.99,130,22.99,G,Deleted Scenes,2006-02-15 05:03:42


##### "original_language_id"

###### Replace nulls with 0 in film.original_language_id

In [377]:
# Fill only the missing entries with 0 rather than overwriting the whole
# column, so a real id would be preserved if the raw data ever contains one.
# The column is read as float64 because it is entirely null; cast it back to
# an integer id once the gaps are filled.
film['original_language_id'] = film['original_language_id'].fillna(0).astype('int64')

film['original_language_id'].value_counts()

original_language_id
0    1000
Name: count, dtype: int64

##### "last_update"

###### Drop  column (constant)

In [378]:
# 'last_update' is treated as a constant column in this table, so it is removed.
film = film.drop(columns=['last_update'])

### 2.4 inventory.csv

In [379]:
inventory = pd.read_csv('../data/0-raw/inventory.csv')
inventory.head()

Unnamed: 0,inventory_id,film_id,store_id,last_update
0,1,1,1,2006-02-15 05:09:17
1,2,1,1,2006-02-15 05:09:17
2,3,1,1,2006-02-15 05:09:17
3,4,1,1,2006-02-15 05:09:17
4,5,1,2,2006-02-15 05:09:17


In [380]:
inventory.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   inventory_id  1000 non-null   int64 
 1   film_id       1000 non-null   int64 
 2   store_id      1000 non-null   int64 
 3   last_update   1000 non-null   object
dtypes: int64(3), object(1)
memory usage: 97.8 KB


In [381]:
inventory.shape

(1000, 4)

In [382]:
sc.check_nan(inventory)

'N nan cols: 0'

Series([], dtype: float64)

In [383]:
#Column cleaning function

inventory= sc.column_unification(inventory)

In [384]:
inventory.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
inventory_id,1000.0,,,,500.5,288.819436,1.0,250.75,500.5,750.25,1000.0
film_id,1000.0,,,,109.866,63.862042,1.0,56.0,111.5,164.0,223.0
store_id,1000.0,,,,1.497,0.500241,1.0,1.0,1.0,2.0,2.0
last_update,1000.0,1.0,2006-02-15 05:09:17,1000.0,,,,,,,


In [385]:
inventory.duplicated().any()

False

##### "last_update"

In [386]:
inventory.last_update.value_counts()

last_update
2006-02-15 05:09:17    1000
Name: count, dtype: int64

###### Drop  column (constant)

In [387]:
# value_counts() above shows one timestamp shared by every row, so the column
# adds nothing and is removed.
inventory = inventory.drop(columns=['last_update'])

##### "store_id"

In [388]:
inventory.store_id.value_counts()

store_id
1    503
2    497
Name: count, dtype: int64

In [389]:
inventory[inventory.duplicated()]

Unnamed: 0,inventory_id,film_id,store_id


In [390]:
inventory.dtypes

inventory_id    int64
film_id         int64
store_id        int64
dtype: object

### 2.5 language.csv

In [391]:
language = pd.read_csv('../data/0-raw/language.csv')
language.head(2)

Unnamed: 0,language_id,name,last_update
0,1,English,2006-02-15 05:02:19
1,2,Italian,2006-02-15 05:02:19


In [392]:
language.shape

(6, 3)

In [393]:
sc.check_nan(language)

'N nan cols: 0'

Series([], dtype: float64)

In [394]:
#Column cleaning function

language= sc.column_unification(language)

In [395]:
# memory_usage takes a bool or 'deep'. True requests the shallow estimate,
# which pandas reports with a trailing "+" when object columns are present.
language.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   language_id  6 non-null      int64 
 1   name         6 non-null      object
 2   last_update  6 non-null      object
dtypes: int64(1), object(2)
memory usage: 272.0+ bytes


In [396]:
language[language.duplicated(['name'])]

Unnamed: 0,language_id,name,last_update


##### "last_update"

###### Drop  column (constant)

In [397]:
# 'last_update' is treated as a constant column in this table, so it is removed.
language = language.drop(columns=['last_update'])

### 2.6 old_HDD.csv

In [398]:
old_HDD = pd.read_csv('../data/0-raw/old_HDD.csv')
old_HDD.head(2)

Unnamed: 0,first_name,last_name,title,release_year,category_id
0,PENELOPE,GUINESS,ACADEMY DINOSAUR,2006,6
1,PENELOPE,GUINESS,ANACONDA CONFESSIONS,2006,2


In [399]:
old_HDD.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   first_name    1000 non-null   object
 1   last_name     1000 non-null   object
 2   title         1000 non-null   object
 3   release_year  1000 non-null   int64 
 4   category_id   1000 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 207.6 KB


In [400]:
old_HDD.shape

(1000, 5)

In [401]:
sc.check_nan(old_HDD)

'N nan cols: 0'

Series([], dtype: float64)

In [402]:
#Column cleaning function

old_HDD= sc.column_unification(old_HDD)

##### "first_name" & "last_name"

In [403]:
# Give both name columns the same casing: first letter upper, the rest lower.
for name_col in ('first_name', 'last_name'):
    old_HDD[name_col] = old_HDD[name_col].str.capitalize()

old_HDD.head()

Unnamed: 0,first_name,last_name,title,release_year,category_id
0,Penelope,Guiness,ACADEMY DINOSAUR,2006,6
1,Penelope,Guiness,ANACONDA CONFESSIONS,2006,2
2,Penelope,Guiness,ANGELS LIFE,2006,13
3,Penelope,Guiness,BULWORTH COMMANDMENTS,2006,10
4,Penelope,Guiness,CHEAPER CLYDE,2006,14


In [404]:
old_HDD[old_HDD.duplicated(['first_name','last_name', 'title'])]

Unnamed: 0,first_name,last_name,title,release_year,category_id


In [405]:
old_HDD.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
first_name,1000.0,38.0,Sandra,56.0,,,,,,,
last_name,1000.0,37.0,Olivier,53.0,,,,,,,
title,1000.0,614.0,BOONDOCK BALLROOM,6.0,,,,,,,
release_year,1000.0,,,,2006.0,0.0,2006.0,2006.0,2006.0,2006.0,2006.0
category_id,1000.0,,,,8.355,4.726872,1.0,4.0,8.0,13.0,16.0


#### "release_year" 

##### It is a constant column, and this table will be merged with films and actors, so it can be dropped

In [406]:
old_HDD.release_year.value_counts()

release_year
2006    1000
Name: count, dtype: int64

In [407]:
# Every row has release_year == 2006 (see value_counts above), so the column
# is removed.
old_HDD = old_HDD.drop(columns=['release_year'])

### 2.7 rental.csv

In [408]:
rental = pd.read_csv('../data/0-raw/rental.csv')
rental.head()

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 21:30:53
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 21:30:53
2,3,2005-05-24 23:03:39,1711,408,2005-06-01 22:12:39,1,2006-02-15 21:30:53
3,4,2005-05-24 23:04:41,2452,333,2005-06-03 01:43:41,2,2006-02-15 21:30:53
4,5,2005-05-24 23:05:21,2079,222,2005-06-02 04:33:21,1,2006-02-15 21:30:53


In [409]:
rental.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   rental_id     1000 non-null   int64 
 1   rental_date   1000 non-null   object
 2   inventory_id  1000 non-null   int64 
 3   customer_id   1000 non-null   int64 
 4   return_date   1000 non-null   object
 5   staff_id      1000 non-null   int64 
 6   last_update   1000 non-null   object
dtypes: int64(4), object(3)
memory usage: 254.0 KB


In [410]:
rental.shape

(1000, 7)

In [411]:
sc.check_nan(rental)

'N nan cols: 0'

Series([], dtype: float64)

In [412]:
#Column cleaning function

rental= sc.column_unification(rental)

In [413]:
rental.describe(include='all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
rental_id,1000.0,,,,501.18,289.19665,1.0,250.75,501.5,751.25,1001.0
rental_date,1000.0,999.0,2005-05-30 14:47:31,2.0,,,,,,,
inventory_id,1000.0,,,,2258.179,1314.667454,2.0,1157.5,2244.5,3371.75,4581.0
customer_id,1000.0,,,,296.408,172.509319,1.0,150.0,296.0,445.25,597.0
return_date,1000.0,997.0,2005-06-05 15:16:54,2.0,,,,,,,
staff_id,1000.0,,,,1.521,0.499809,1.0,1.0,2.0,2.0,2.0
last_update,1000.0,1.0,2006-02-15 21:30:53,1000.0,,,,,,,


In [414]:
rental.duplicated().any()

False

##### "rental_id"

In [415]:
# rental_id reaches 1001 although the table has 1000 rows, so the ids have a
# gap. Renumber them consecutively from 1, sized from the table itself rather
# than a hard-coded row count.
# Per the original author's note, rental_id is not referenced by any other
# table, so renumbering breaks no joins.
rental['rental_id'] = range(1, len(rental) + 1)

##### "last_update"

###### Drop  column (constant)

In [416]:
# describe() above reports a single unique 'last_update' value, so the column
# adds nothing and is removed.
rental = rental.drop(columns=['last_update'])

In [417]:
category

Unnamed: 0,category_id,name
0,1,Action
1,2,Animation
2,3,Children
3,4,Classics
4,5,Comedy
5,6,Documentary
6,7,Drama
7,8,Family
8,9,Foreign
9,10,Games


## 3. Export documents

In [341]:
actors.to_csv('../data/1-clean/actor.csv', index=False)

In [342]:
category.to_csv('../data/1-clean/category.csv', index=False)

In [343]:
film.to_csv('../data/1-clean/film.csv', index=False)

In [344]:
inventory.to_csv('../data/1-clean/inventory.csv', index=False)

In [345]:
language.to_csv('../data/1-clean/language.csv', index=False)

In [346]:
old_HDD.to_csv('../data/1-clean/old_HDD.csv', index=False)

In [347]:
rental.to_csv('../data/1-clean/rental.csv', index=False)

### DONE CLEANING!