In [92]:
#purpose:  Guide to Data Cleansing then Bulk Loading Data (ETL) into MySQL using python
import pandas as pd #
import numpy as np

In [93]:
'''
source: https://realpython.com/python-data-cleaning-numpy-pandas/
In this tutorial, we’ll leverage Python’s Pandas and NumPy libraries to clean data.

We’ll cover the following:

Dropping unnecessary columns in a DataFrame
Changing the index of a DataFrame
Using .str() methods to clean columns
Using the DataFrame.applymap() function to clean the entire dataset, element-wise
Renaming columns to a more recognizable set of labels
Skipping unnecessary rows in a CSV file
'''
# Load the source CSV file into a pandas DataFrame.
source_csv = 'C:\\ODSA\\data\\BL-Flickr-Images-Book.csv'
df = pd.read_csv(source_csv)
df.head()  # peek at the first rows to see what each column contains


Unnamed: 0,Identifier,Edition Statement,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Corporate Author,Corporate Contributors,Former owner,Engraver,Issuance type,Flickr URL,Shelfmarks
0,206,,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12641.b.30.
1,216,,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12626.cc.2.
2,218,,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12625.dd.1.
3,472,,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.","Appleyard, Ernest Silvanus.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 10369.bbb.15.
4,480,"A new edition, revised, etc.",London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 9007.d.28.


In [94]:
# Dropping Columns in a DataFrame
# Labels of the columns that are removed from the frame before further cleaning.
col_to_drop = [
    'Edition Statement',
    'Corporate Author',
    'Corporate Contributors',
    'Former owner',
    'Engraver',
    'Contributors',
    'Issuance type',
    'Shelfmarks',
]

# columns=... is the keyword spelling of "these labels along axis=1";
# the frame is still modified in place.
df.drop(columns=col_to_drop, inplace=True)
df.head()

Unnamed: 0,Identifier,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
0,206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
1,216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
2,218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
3,472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
4,480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [95]:
# Create an index field from "Identifier", but only when its values are unique.
identifier_is_unique = df['Identifier'].is_unique
if identifier_is_unique:
    print('pandas dataframe field is unique')
    df = df.set_index('Identifier')
    # Rows can now be looked up by their Identifier value.
    print(df.loc[206])
df.head()

pandas dataframe field is unique
Place of Publication                                               London
Date of Publication                                           1879 [1878]
Publisher                                                S. Tinsley & Co.
Title                                   Walter Forbes. [A novel.] By A. A
Author                                                              A. A.
Flickr URL              http://www.flickr.com/photos/britishlibrary/ta...
Name: 206, dtype: object


Unnamed: 0_level_0,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [96]:
#Tidying up Fields in the Data
'''
clean specific columns and get them to a uniform format to get a better 
understanding of the dataset and enforce consistency
'''
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in
# pandas 1.0; counting the entries of .dtypes gives the same per-dtype tally.
df.dtypes.value_counts() #get data type counts

object    6
dtype: int64

In [97]:
# Convert the first 4 characters of the "Date of Publication" field to a
# number so the year can be used in future calculations.
# The pattern only matches values that START with four digits; values that do
# not match come back as NaN from str.extract.
extr = df['Date of Publication'].str.extract(r'^(\d{4})', expand=False)
df['Date of Publication'] = pd.to_numeric(extr)
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in
# pandas 1.0; counting the entries of .dtypes gives the same per-dtype tally.
df.dtypes.value_counts() #get data type counts

object     5
float64    1
dtype: int64

In [98]:
#Combining str Methods with NumPy to Clean Columns
#np.where function, which is basically a vectorized form of Excel's IF() macro.
#np.where(condition1, x1,
#        np.where(condition2, x2,
#            np.where(condition3, x3, ...))
#to clean this column in one sweep, we can use str.contains() to get a boolean mask.
pub = df['Place of Publication']
# na=False: a missing place must count as "no match". Without it the mask
# holds NaN for missing values, and np.where treats NaN as truthy, which would
# label those rows 'London'.
london = pub.str.contains('London', na=False)
print(london[:5]) # print first 5 entries of the boolean mask
oxford = pub.str.contains('Oxford', na=False)
# Anything that is neither London nor Oxford keeps its own text, with each
# hyphen "-" replaced by a single space ' '. regex=False pins the hyphen to a
# literal character regardless of the pandas version's default.
df['Place of Publication'] = np.where(london, 'London',
                                      np.where(oxford, 'Oxford',
                                               pub.str.replace('-', ' ', regex=False)))
df.head()

Identifier
206    True
216    True
218    True
472    True
480    True
Name: Place of Publication, dtype: bool


Unnamed: 0_level_0,Place of Publication,Date of Publication,Publisher,Title,Author,Flickr URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206,London,1879.0,S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
216,London,1868.0,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
218,London,1869.0,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
472,London,1851.0,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
480,London,1857.0,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [99]:
#replace multiple values in a pandas dataframe by applying the applymap() function
# create callable function to apply to each data element - 
#see https://chrisalbon.com/python/data_wrangling/pandas_apply_operations_to_dataframes/ for more details
def chg_author(item):
    """Return the replacement author name for matching strings.

    Meant to be applied element-wise to a DataFrame.

    Parameters
    ----------
    item : object
        A single cell value of any type.

    Returns
    -------
    object
        'J.Bergmann' when ``item`` is a plain ``str`` that contains the text
        'A., A. A.'; otherwise ``item`` itself, unchanged.
    """
    # Exact type check (not isinstance): only plain str values are inspected,
    # everything else (numbers, NaN, None, ...) passes straight through.
    is_plain_str = type(item) is str
    if is_plain_str and 'A., A. A.' in item:
        return 'J.Bergmann'
    return item
# Apply chg_author to every element of df; the result is a new DataFrame.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 (applymap now
# emits a FutureWarning), so use whichever name this pandas version provides.
if hasattr(pd.DataFrame, 'map'):
    cleaned_df = df.map(chg_author)
else:
    cleaned_df = df.applymap(chg_author)
print(cleaned_df.head())

           Place of Publication  Date of Publication              Publisher  \
Identifier                                                                    
206                      London               1879.0       S. Tinsley & Co.   
216                      London               1868.0           Virtue & Co.   
218                      London               1869.0  Bradbury, Evans & Co.   
472                      London               1851.0          James Darling   
480                      London               1857.0   Wertheim & Macintosh   

                                                        Title      Author  \
Identifier                                                                  
206                         Walter Forbes. [A novel.] By A. A       A. A.   
216         All for Greed. [A novel. The dedication signed...  J.Bergmann   
218         Love the Avenger. By the author of “All for Gr...  J.Bergmann   
472         Welsh Sketches, chiefly ecclesiastical, to the...

In [104]:
# Export the resulting "cleaned" dataset to a CSV file.
# header=False leaves out the column-name row (for the import into MySQL);
# the utf-8 encoding is set explicitly because it helps with the later import.
cleaned_df.to_csv(
    'C:\\ODSA\\data\\BL-Flickr-Images-Book-Cleaned.csv',
    header=False,
    encoding='utf-8',
)

In [101]:
#purpose:  Python Bulk Loading Data (ETL) into MySQL with Python
#Insert "Cleaned" csv file into MySQL database table
import csv #Python Core Package - read the csv file from a local path
import os #Python Core Package - optional password override from the environment

import pymysql #pymysql package to connect to MySQL and execute SQL statements with Python

# The password defaults to the original local value but can be supplied via
# the MYSQL_PASSWORD environment variable instead of living in the notebook.
mydb = pymysql.connect(host='localhost',
                       user='root',
                       passwd=os.environ.get('MYSQL_PASSWORD', 'root'),
                       db='test')

# Read from the same location the export step writes to, so the load does not
# depend on the notebook's current working directory.
# newline='' is what the csv module expects so that line breaks inside quoted
# fields are parsed correctly.
# The rows are materialised inside the with-block so the file handle is closed
# here while csv_data stays usable (and re-iterable) afterwards.
with open('C:\\ODSA\\data\\BL-Flickr-Images-Book-Cleaned.csv',
          encoding='utf-8', newline='') as csv_file:
    csv_data = list(csv.reader(csv_file))

In [102]:
#import each row of the csv file in to the corresponding database/table in MySQL
# One placeholder per CSV column, in the order the columns were exported.
insert_sql = ('INSERT INTO etl_books(Identifier, `Place of Publication`, `Date of Publication`, '
              'Publisher, Title, Author, `Flickr URL`) '
              'VALUES(%s,%s,%s,%s,%s,%s,%s)')
# Materialise the rows so an empty input can be detected up front.
rows = list(csv_data)
# NOTE(review): missing values arrive from the CSV as empty strings, not NULL;
# confirm that is what the etl_books columns should store.
cursor = mydb.cursor()
try:
    if rows:
        # executemany sends the rows in bulk instead of one statement per row.
        cursor.executemany(insert_sql, rows)
    mydb.commit()
except Exception:
    # Do not leave a half-finished transaction open on the connection.
    mydb.rollback()
    raise

In [103]:
#close the cursor and the MySQL connection
cursor.close()
# The connection itself must be closed too; closing only the cursor leaves
# the session to the MySQL server open.
mydb.close()
print("Imported!")

SyntaxError: invalid syntax (<ipython-input-103-e201f8dc407f>, line 12)