# Cleaning a CSV file for SQLite

In [2]:
import pandas as pd
awards = pd.read_csv('academy_awards.csv', encoding = 'ISO-8859-1')
awards.head(3)

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2010 (83rd),Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},NO,,,,,,
1,2010 (83rd),Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},NO,,,,,,
2,2010 (83rd),Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},NO,,,,,,


In [3]:
# Are the last 6 columns only NaN values?
awards.columns[5:11].value_counts()

Unnamed: 7     1
Unnamed: 6     1
Unnamed: 10    1
Unnamed: 8     1
Unnamed: 5     1
Unnamed: 9     1
dtype: int64

### Cleaning the 'Year' column

In [4]:
# select the first 4 elements in the string and convert to int
awards['Year']=awards['Year'].str[0:4].astype(int)

In [5]:
print(awards['Year'].dtype)
# pool only the data for years after 2000
later_than_2000 = awards[awards['Year']>2000]

int64


### Filtering by 'Category' type

In [6]:
# filter out only the rows that contain the below award_categories
# (the 4 awards we are interested in)
award_categories = ['Actor -- Leading Role','Actor -- Supporting Role','Actress -- Leading Role', 'Actress -- Supporting Role']

# Save in nominations DF
nominations = later_than_2000[later_than_2000['Category'].isin(award_categories)]
nominations.head(3)

Unnamed: 0,Year,Category,Nominee,Additional Info,Won?,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,2010,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},NO,,,,,,
1,2010,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},NO,,,,,,
2,2010,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},NO,,,,,,


### Converting 'Won?' to integer / boolean values
because SQLite uses integers to represent booleans

In [7]:
# Convert the won? column into integer for ease in processing
replace_dict = {'YES': 1, 'NO':0}
nominations['Won']=nominations['Won?'].map(replace_dict)
print(nominations['Won'][:5])

0    0
1    0
2    0
3    1
4    0
Name: Won, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


### Dropping unneeded columns

In [8]:
# Drop the last 6 columns since it only contains NULL values
cols = ['Won?','Unnamed: 5','Unnamed: 6','Unnamed: 7','Unnamed: 8','Unnamed: 9','Unnamed: 10',]
final_nominations = nominations.drop(cols,axis=1)

In [9]:
final_nominations.head(3)

Unnamed: 0,Year,Category,Nominee,Additional Info,Won
0,2010,Actor -- Leading Role,Javier Bardem,Biutiful {'Uxbal'},0
1,2010,Actor -- Leading Role,Jeff Bridges,True Grit {'Rooster Cogburn'},0
2,2010,Actor -- Leading Role,Jesse Eisenberg,The Social Network {'Mark Zuckerberg'},0


### Cleaning the 'Additional Info' column
Using:
- .str[index]
- .str.rstrip
- .str.split

In [10]:
additional_info_one = final_nominations['Additional Info'].str.rstrip("'}")
additional_info_two = additional_info_one.str.split("{'")
print (additional_info_two[:3])

0                        [Biutiful , Uxbal]
1             [True Grit , Rooster Cogburn]
2    [The Social Network , Mark Zuckerberg]
Name: Additional Info, dtype: object


In [11]:
# Extract the movie name and actors
movie_names= additional_info_two.str[0] 
characters = additional_info_two.str[1]

In [12]:
# Insert into final_nominations DF
final_nominations['Movie']=movie_names
final_nominations['Character']=characters
# and drop the additional info column
final_nominations=final_nominations.drop('Additional Info', axis=1)
final_nominations.head(4)

Unnamed: 0,Year,Category,Nominee,Won,Movie,Character
0,2010,Actor -- Leading Role,Javier Bardem,0,Biutiful,Uxbal
1,2010,Actor -- Leading Role,Jeff Bridges,0,True Grit,Rooster Cogburn
2,2010,Actor -- Leading Role,Jesse Eisenberg,0,The Social Network,Mark Zuckerberg
3,2010,Actor -- Leading Role,Colin Firth,1,The King's Speech,King George VI


# Exporting to SQLite
pd.to_sql(table_name, conn, if_exists='append/fail', chunksize='#', index=True/False)
- pd.to_sql creates table, then runs query to insert the DF
arguments:
- conn : connection
- chunksize : number of items per page when working with big data
- index : whether or now to use DF index as columns

In [13]:
# Connect to the desired database name
# if it doesn't exist, it will automatically create one
import sqlite3

conn = sqlite3.connect('nominations.db')

# convert DF to sql
final_nominations.to_sql('nominations', conn, index=False)

In [17]:
# Check to see if exported properly
table = conn.execute('pragma table_info(nominations)').fetchall()
print(table)

[(0, 'Year', 'INTEGER', 0, None, 0), (1, 'Category', 'TEXT', 0, None, 0), (2, 'Nominee', 'TEXT', 0, None, 0), (3, 'Won', 'INTEGER', 0, None, 0), (4, 'Movie', 'TEXT', 0, None, 0), (5, 'Character', 'TEXT', 0, None, 0)]


In [18]:
print (conn.execute('select * from nominations limit 10').fetchall())

[(2010, 'Actor -- Leading Role', 'Javier Bardem', 0, 'Biutiful ', 'Uxbal'), (2010, 'Actor -- Leading Role', 'Jeff Bridges', 0, 'True Grit ', 'Rooster Cogburn'), (2010, 'Actor -- Leading Role', 'Jesse Eisenberg', 0, 'The Social Network ', 'Mark Zuckerberg'), (2010, 'Actor -- Leading Role', 'Colin Firth', 1, "The King's Speech ", 'King George VI'), (2010, 'Actor -- Leading Role', 'James Franco', 0, '127 Hours ', 'Aron Ralston'), (2010, 'Actor -- Supporting Role', 'Christian Bale', 1, 'The Fighter ', 'Dicky Eklund'), (2010, 'Actor -- Supporting Role', 'John Hawkes', 0, "Winter's Bone ", 'Teardrop'), (2010, 'Actor -- Supporting Role', 'Jeremy Renner', 0, 'The Town ', 'James Coughlin'), (2010, 'Actor -- Supporting Role', 'Mark Ruffalo', 0, 'The Kids Are All Right ', 'Paul'), (2010, 'Actor -- Supporting Role', 'Geoffrey Rush', 0, "The King's Speech ", 'Lionel Logue')]


In [19]:
# close out of connection
conn.close()