In [1]:
#importing libraries
import pandas as pd
from dfply import *
import seaborn as sns
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import os
%matplotlib inline

In [24]:
#csv_database.dispose()
#os.remove('csv_database.db')

In [2]:
#setting display options for pandas
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [3]:
#coding ideas and comments from https://stackoverflow.com/questions/42900757/sequentially-read-huge-csv-file-in-python/42960918
#trying to understand what would be the best way to handle these data if they were huge
file = '/Users/ariafredman/Documents/data_science/data/star_wars_info_data/characters.csv'
#checking out the first 5 rows of the data
#not necessary for these small data
#but could be good if our data are super large
print(pd.read_csv(file, nrows = 5))

             name  height  mass hair_color   skin_color eye_color birth_year  \
0  Luke Skywalker     172    77      blond         fair      blue      19BBY   
1           C-3PO     167    75        NaN         gold    yellow     112BBY   
2           R2-D2      96    32        NaN  white, blue       red      33BBY   
3     Darth Vader     202   136       none        white    yellow    41.9BBY   
4     Leia Organa     150    49      brown        light     brown      19BBY   

   gender homeworld species  
0    male  Tatooine   Human  
1     NaN  Tatooine   Droid  
2     NaN     Naboo   Droid  
3    male  Tatooine   Human  
4  female  Alderaan   Human  


In [100]:
#creating the sqllite database
csv_database = create_engine('sqlite:///csv_database.db')

In [101]:
#iterating through the CSV file in chunks and store the data into sqllite
#setting the chunksize at 10 which means we'll have 10 rows per chunk
#The for loop read a chunk of data from the CSV file, 
#If you want to remove space from any of column names
#use df = df.rename(columns={c: c.replace(' ', '') for c in df.columns}) 
#then stores the chunk into the sqllite database (df.to_sql(…))
chunksize = 10
index_offset = 1
for df in pd.read_csv(file, chunksize = chunksize, iterator = True):
      df.index += index_offset
      df.to_sql('table', csv_database, if_exists = 'append')
      index_offset += 1

In [102]:
#we can now pull the data from the query
df1 = pd.read_sql_query("SELECT * FROM 'table' LIMIT 5", csv_database)
print(df1)

   index            name  height  mass hair_color   skin_color eye_color  \
0      1  Luke Skywalker     172    77      blond         fair      blue   
1      2           C-3PO     167    75       None         gold    yellow   
2      3           R2-D2      96    32       None  white, blue       red   
3      4     Darth Vader     202   136       none        white    yellow   
4      5     Leia Organa     150    49      brown        light     brown   

  birth_year  gender homeworld species  
0      19BBY    male  Tatooine   Human  
1     112BBY    None  Tatooine   Droid  
2      33BBY    None     Naboo   Droid  
3    41.9BBY    male  Tatooine   Human  
4      19BBY  female  Alderaan   Human  


In [103]:
df_char_sql = pd.read_sql_query("SELECT * FROM 'table'", csv_database)

In [26]:
#another option if you can pull it all into memory
#but maybe the reading in part is the problematic one
#this data output isn't dataframe, but pandas.io.parsers.TextFileReader
it_df_char = pd.read_csv(file, iterator = True, chunksize=10)
print(it_df_char)

<pandas.io.parsers.TextFileReader object at 0x7fdb19b6a510>


In [42]:
#If you need a dataframe use concat for all chunks to df
chunks_list = []

for chunk in pd.read_csv(file, iterator = True, chunksize = 10):
    chunks_list.append(chunk)
    
#we can use this to see how many chunks we have:
print("Number of chunks: ", len(l1))

df_char_it = pd.concat(l1, ignore_index = True)

print(df_char_it.head(9))
print(df_char_it.shape)

Number of chunks:  9
                 name  height mass   hair_color   skin_color eye_color  \
0      Luke Skywalker   172.0   77        blond         fair      blue   
1               C-3PO   167.0   75          NaN         gold    yellow   
2               R2-D2    96.0   32          NaN  white, blue       red   
3         Darth Vader   202.0  136         none        white    yellow   
4         Leia Organa   150.0   49        brown        light     brown   
5           Owen Lars   178.0  120  brown, grey        light      blue   
6  Beru Whitesun lars   165.0   75        brown        light      blue   
7               R5-D4    97.0   32          NaN   white, red       red   
8   Biggs Darklighter   183.0   84        black        light     brown   

  birth_year  gender homeworld species  
0      19BBY    male  Tatooine   Human  
1     112BBY     NaN  Tatooine   Droid  
2      33BBY     NaN     Naboo   Droid  
3    41.9BBY    male  Tatooine   Human  
4      19BBY  female  Alderaan   

In [41]:
#reading the data in the usual way
df_char_usual = pd.read_csv('/Users/ariafredman/Documents/data_science/data/star_wars_info_data/characters.csv')
df_char_usual.head(9)

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld,species
0,Luke Skywalker,172.0,77,blond,fair,blue,19BBY,male,Tatooine,Human
1,C-3PO,167.0,75,,gold,yellow,112BBY,,Tatooine,Droid
2,R2-D2,96.0,32,,"white, blue",red,33BBY,,Naboo,Droid
3,Darth Vader,202.0,136,none,white,yellow,41.9BBY,male,Tatooine,Human
4,Leia Organa,150.0,49,brown,light,brown,19BBY,female,Alderaan,Human
5,Owen Lars,178.0,120,"brown, grey",light,blue,52BBY,male,Tatooine,Human
6,Beru Whitesun lars,165.0,75,brown,light,blue,47BBY,female,Tatooine,Human
7,R5-D4,97.0,32,,"white, red",red,,,Tatooine,Droid
8,Biggs Darklighter,183.0,84,black,light,brown,24BBY,male,Tatooine,Human


In [44]:
#Checking that all df_char_usual and df_char_it are the same
df_char_usual.equals(df_char_it)
#these aren't identical

False

In [92]:
#making them all objects doesn't seem to have fixed the issue
df_char_usualstr = df_char_usual.copy(deep = True).astype(str)
df_char_itstr = df_char_it.copy(deep = True).astype(str)
print('Are df_char_usualstr and df_char_itstr identical? \n', (df_char_usualstr).equals(df_char_itstr))
print('df_char_itstr data types: \n', df_char_itstr.dtypes)
print('df_char_usualstr data types: \n', df_char_usualstr.dtypes)

Are df_char_usualstr and df_char_itstr identical? 
 False
df_char_itstr data types: 
 name          object
height        object
mass          object
hair_color    object
skin_color    object
eye_color     object
birth_year    object
gender        object
homeworld     object
species       object
dtype: object
df_char_usualstr data types: 
 name          object
height        object
mass          object
hair_color    object
skin_color    object
eye_color     object
birth_year    object
gender        object
homeworld     object
species       object
dtype: object


In [119]:
#creating an array to see where does each value matchs and misses
comparison_array = df_char_usual.values == df_char_it.values
#first 5 rows
print(comparison_array[0:5])

[[ True  True False  True  True  True  True  True  True  True]
 [ True  True False False  True  True  True False  True  True]
 [ True  True False False  True  True  True False  True  True]
 [ True  True False  True  True  True  True  True  True  True]
 [ True  True False  True  True  True  True  True  True  True]]


In [128]:
#these are the cells that are supposed to be problematic in the 1st row
print(df_char_usual.iloc[0,2])
print(df_char_it.iloc[0,2])
#python thinks these are different
print(df_char_usual.iloc[0,2] == df_char_it.iloc[0,2])
print(str(df_char_usual.iloc[0,2]) == str(df_char_it.iloc[0,2]))
#I think this proves that the data are equivalent, 
#but something about converting the entire dataframe and/or how pandas is evaluating things is messed up

77
77
False
True


In [104]:
#so these also aren't identical
#so let's look into that a bit more
print(df_char_usual.equals(df_char_sql))
print(df_char_it.equals(df_char_sql))

False
False


In [105]:
print(df_char_sql.head())
#it looks like it may just be the extra index column

   index            name  height mass hair_color   skin_color eye_color  \
0      1  Luke Skywalker   172.0   77      blond         fair      blue   
1      2           C-3PO   167.0   75       None         gold    yellow   
2      3           R2-D2    96.0   32       None  white, blue       red   
3      4     Darth Vader   202.0  136       none        white    yellow   
4      5     Leia Organa   150.0   49      brown        light     brown   

  birth_year  gender homeworld species  
0      19BBY    male  Tatooine   Human  
1     112BBY    None  Tatooine   Droid  
2      33BBY    None     Naboo   Droid  
3    41.9BBY    male  Tatooine   Human  
4      19BBY  female  Alderaan   Human  


In [106]:
df_char_sql = df_char_sql >> drop(['index'])

In [107]:
df_char_sql.head()

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld,species
0,Luke Skywalker,172.0,77,blond,fair,blue,19BBY,male,Tatooine,Human
1,C-3PO,167.0,75,,gold,yellow,112BBY,,Tatooine,Droid
2,R2-D2,96.0,32,,"white, blue",red,33BBY,,Naboo,Droid
3,Darth Vader,202.0,136,none,white,yellow,41.9BBY,male,Tatooine,Human
4,Leia Organa,150.0,49,brown,light,brown,19BBY,female,Alderaan,Human


In [108]:
#crap still not equal
print(df_char_usual.equals(df_char_sql))
print(df_char_it.equals(df_char_sql))
#maybe the datatypes?

False
False


In [109]:
df_char_usual.head()

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld,species
0,Luke Skywalker,172.0,77,blond,fair,blue,19BBY,male,Tatooine,Human
1,C-3PO,167.0,75,,gold,yellow,112BBY,,Tatooine,Droid
2,R2-D2,96.0,32,,"white, blue",red,33BBY,,Naboo,Droid
3,Darth Vader,202.0,136,none,white,yellow,41.9BBY,male,Tatooine,Human
4,Leia Organa,150.0,49,brown,light,brown,19BBY,female,Alderaan,Human


In [110]:
#no, that's not it either
print(df_char_usual.dtypes)
print(df_char_sql.dtypes)

name           object
height        float64
mass           object
hair_color     object
skin_color     object
eye_color      object
birth_year     object
gender         object
homeworld      object
species        object
dtype: object
name           object
height        float64
mass           object
hair_color     object
skin_color     object
eye_color      object
birth_year     object
gender         object
homeworld      object
species        object
dtype: object


In [111]:
#well that's not the same
#it looks like maybe the sql one had a duplicate row for each?
print('The shape of the data imported the usual way is :', df_char_usual.shape)
print('The shape of the data imported the database way is :',df_char_sql.shape)

The shape of the data imported the usual way is : (87, 10)
The shape of the data imported the database way is : (87, 10)


In [112]:
#I think it would be helpful to close and dispose of the engine and then try again
#because I ran the code a few times so I think it kept re-appending
csv_database.dispose()
os.remove('csv_database.db')

In [96]:
#the above seems to have 
#closed and dispoed of the engine
print(pd.read_sql_query("SELECT * FROM 'table'", csv_database))

OperationalError: (sqlite3.OperationalError) no such table: table
[SQL: SELECT * FROM 'table']
(Background on this error at: http://sqlalche.me/e/e3q8)

In [113]:
#rebuilding the engine and redoing everything
#and then checking if everything is equal
csv_database = create_engine('sqlite:///csv_database.db')

chunksize = 10
index_offset = 1
for df in pd.read_csv(file, chunksize = chunksize, iterator = True):
      df.index += index_offset
      df.to_sql('table', csv_database, if_exists = 'append')
      index_offset += 1

In [114]:
df_char_sql1 = pd.read_sql_query("SELECT * FROM 'table'", csv_database)

df_char_sql1 = df_char_sql1 >> drop(['index'])

print(df_char_usual.equals(df_char_sql1))
print(df_char_it.equals(df_char_sql1))
#YAYAYAYAY

False
False


In [115]:
df_char_sql1.head()

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld,species
0,Luke Skywalker,172.0,77,blond,fair,blue,19BBY,male,Tatooine,Human
1,C-3PO,167.0,75,,gold,yellow,112BBY,,Tatooine,Droid
2,R2-D2,96.0,32,,"white, blue",red,33BBY,,Naboo,Droid
3,Darth Vader,202.0,136,none,white,yellow,41.9BBY,male,Tatooine,Human
4,Leia Organa,150.0,49,brown,light,brown,19BBY,female,Alderaan,Human


In [116]:
#dumping the database and rebuilding without the index_offset, and then checking that dataframes are equal
#this looks the same but it's not
#and tbh I'm too lazy to look for the differences right now
csv_database.dispose()
os.remove('csv_database.db')

csv_database = create_engine('sqlite:///csv_database.db')

chunksize = 10

for df in pd.read_csv(file, chunksize = chunksize, iterator = True):
      df.to_sql('table', csv_database, if_exists = 'append')

df_char_sql2 = pd.read_sql_query("SELECT * FROM 'table'", csv_database)

df_char_sql2 = df_char_sql2 >> drop(['index'])

print(df_char_usual.equals(df_char_sql2))
print(df_char_it.equals(df_char_sql2))
print(df_char_sql1.equals(df_char_sql2))

False
False
True


In [117]:
#looking at the unshared columns
df_char_sql1.merge(df_char_sql2, how = 'outer' ,indicator=False)

Unnamed: 0,name,height,mass,hair_color,skin_color,eye_color,birth_year,gender,homeworld,species
0,Luke Skywalker,172.0,77.0,blond,fair,blue,19BBY,male,Tatooine,Human
1,C-3PO,167.0,75.0,,gold,yellow,112BBY,,Tatooine,Droid
2,R2-D2,96.0,32.0,,"white, blue",red,33BBY,,Naboo,Droid
3,Darth Vader,202.0,136.0,none,white,yellow,41.9BBY,male,Tatooine,Human
4,Leia Organa,150.0,49.0,brown,light,brown,19BBY,female,Alderaan,Human
5,Owen Lars,178.0,120.0,"brown, grey",light,blue,52BBY,male,Tatooine,Human
6,Beru Whitesun lars,165.0,75.0,brown,light,blue,47BBY,female,Tatooine,Human
7,R5-D4,97.0,32.0,,"white, red",red,,,Tatooine,Droid
8,Biggs Darklighter,183.0,84.0,black,light,brown,24BBY,male,Tatooine,Human
9,Obi-Wan Kenobi,182.0,77.0,"auburn, white",fair,blue-gray,57BBY,male,Stewjon,Human


In [None]:
df_char_sql1.replace(' ', 'LLLLLLL', regex=True)
#it doesn't seem to be whitespaces

In [None]:
df_char_sql2.replace(' ', 'LLLLLLL', regex=True)

In [None]:
print(df_char_sql2.shape)
print(df_char_sql1.shape)

In [None]:
print((df_char_sql2.replace(' ', 'LLLLLLL', regex=True)).equals((df_char_sql1.replace(' ', 'LLLLLLL', regex=True))))