# EDA-Cleaning: Part I

In [1]:
import os
import sys
import csv
import pandas as pd

from io import StringIO

- Starting functions to handle pandas.read_csv errors
- To be refactored and extended
- Possible to eventually create a class/object to extend pandas 'on_bad_lines' error handling?

In [2]:
'''Begining development on pandas csv error handling functions'''

def pandas_bad_lines_capture(file: str):
    ### adapted from http://yilmazturk.info/2019/09/02/pandas-bad-line-reporter/
    '''On bad lines capture and save bad lines for later cleaning. Returns data frame with bad lines dropped'''
    original_stderr = sys.stderr
    temp_stderr = StringIO()
    sys.stderr = temp_stderr
    
    d = pd.read_csv(file, on_bad_lines='warn')
    
    sys.stderr = original_stderr
    e_captured = temp_stderr.getvalue()
    
    error_out = file + '_bad_lines.txt'
    with open(error_out, 'w') as bad_lines:
        for line in e_captured.split(r'\n'):
            bad_lines.write(line)
            bad_lines.write('\n')
            
    print('Pandas encountered errors with ' + file + ', please check ' + error_out
          + ' for details')
    
    return d

def get_error_strings(file:str)-> list:
    '''Read Error File Lines into a list'''
    lst = []
    with open(file, 'r') as f:
        for line in f:
            if 'Skipping' in line:
                lst.append(line.strip())
    return lst

def get_error_lines_dict(lst: list)-> dict:
    '''Parse Error Lines into a dict with the correct line number as a key'''
    first_message = len('Skipping line ')
    dict_ = {}
    for line in lst:
        error_line_start = line.find('Sk') + first_message
        error_line_end = line.find(':')
        error_line_num = int(line[error_line_start:error_line_end]) -1
        error_message = line[error_line_end + 1:]
        dict_[error_line_num] = error_message
    return dict_   

def get_bad_lines(file: str, e_lines: dict)->tuple[list, list]:
    header = None
    bad_lines = []
    keys = list(e_lines.keys())
    
    with open(file, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile, 'unix')
        for i, row in enumerate(reader):
            if i == 0:
                header = row
            elif i in keys:
                bad_lines.append(row)
    return header, bad_lines

### We'll start by checking the current contents of our project folder...

In [3]:
os.listdir()

['.git',
 '.ipynb_checkpoints',
 'books_rating_project_EDA_1.ipynb',
 'books_rating_project_EDA_2.ipynb',
 'data',
 'README.md']

### And our data folder

In [4]:
os.listdir('data')

['uncleaned_books.csv',
 'uncleaned_books.csv_bad_lines.txt',
 'working_books.csv']

### Let's create a couple of variables to make dealing with filepaths a bit easier

In [5]:
folder = 'data/'
ratings_csv = 'uncleaned_books.csv'
ratings_path = folder + ratings_csv

### First attempt at reading the csv into pandas

In [6]:
df_books_ratings = pd.read_csv(ratings_path)

ParserError: Error tokenizing data. C error: Expected 12 fields in line 3350, saw 13


### Let's use the first error handling function to open the file for a bit of exploration and record the bad lines for later cleaning

In [7]:
df_books_ratings = pandas_bad_lines_capture(ratings_path)

Pandas encountered errors with data/uncleaned_books.csv, please check data/uncleaned_books.csv_bad_lines.txt for details


In [8]:
df_books_ratings.head(5)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


* interesting to note that books with multiple authors seem to be seperated by a '/' 

In [9]:
df_books_ratings.shape

(11123, 12)

In [10]:
df_books_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11123 entries, 0 to 11122
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   bookID              11123 non-null  int64  
 1   title               11123 non-null  object 
 2   authors             11123 non-null  object 
 3   average_rating      11123 non-null  float64
 4   isbn                11123 non-null  object 
 5   isbn13              11123 non-null  int64  
 6   language_code       11123 non-null  object 
 7     num_pages         11123 non-null  int64  
 8   ratings_count       11123 non-null  int64  
 9   text_reviews_count  11123 non-null  int64  
 10  publication_date    11123 non-null  object 
 11  publisher           11123 non-null  object 
dtypes: float64(1), int64(5), object(6)
memory usage: 1.0+ MB


### Preliminarily it looks like our remaining data is clean. Let's circle back and deal with our bad lines before proceeding with exploration and further cleaning

### Let's see how many lines caused trouble during the initial load of the csv

In [11]:
os.listdir('data')

['uncleaned_books.csv',
 'uncleaned_books.csv_bad_lines.txt',
 'working_books.csv']

In [12]:
errors_file = 'uncleaned_books.csv_bad_lines.txt'
errors_path = folder + errors_file
errors_path

'data/uncleaned_books.csv_bad_lines.txt'

In [13]:
with open(errors_path, 'r') as errors:
    count = 0
    for line in errors.readlines():
        count += 1

In [14]:
count

6

### With 6 errors at most, we'll look at the errors inline before processing further

In [15]:
with open(errors_path, 'r') as errors:
    for line in errors.readlines():
        print(line)

b'Skipping line 3350: expected 12 fields, saw 13

Skipping line 4704: expected 12 fields, saw 13

Skipping line 5879: expected 12 fields, saw 13

Skipping line 8981: expected 12 fields, saw 13

'





### Process the errors text file to make it easier to pull the bad lines out of the original csv

In [16]:
lst_errors = get_error_strings(errors_path)
lst_errors

["b'Skipping line 3350: expected 12 fields, saw 13",
 'Skipping line 4704: expected 12 fields, saw 13',
 'Skipping line 5879: expected 12 fields, saw 13',
 'Skipping line 8981: expected 12 fields, saw 13']

In [17]:
dict_errors = get_error_lines_dict(lst_errors)
dict_errors

{3349: ' expected 12 fields, saw 13',
 4703: ' expected 12 fields, saw 13',
 5878: ' expected 12 fields, saw 13',
 8980: ' expected 12 fields, saw 13'}

### And finally retrieve the bad lines from the original file

In [18]:
lst_header, lst_bad_lines = get_bad_lines(ratings_path, dict_errors)

In [19]:
lst_header

['bookID',
 'title',
 'authors',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 '  num_pages',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher']

In [20]:
lst_bad_lines

[['12224',
  'Streetcar Suburbs: The Process of Growth in Boston  1870-1900',
  'Sam Bass Warner',
  ' Jr./Sam B. Warner',
  '3.58',
  '0674842111',
  '9780674842113',
  'en-US',
  '236',
  '61',
  '6',
  '4/20/2004',
  'Harvard University Press'],
 ['16914',
  "The Tolkien Fan's Medieval Reader",
  'David E. Smith (Turgon of TheOneRing.net',
  ' one of the founding members of this Tolkien website)/Verlyn Flieger/Turgon (=David E. Smith)',
  '3.58',
  '1593600119',
  '9781593600112',
  'eng',
  '400',
  '26',
  '4',
  '4/6/2004',
  'Cold Spring Press'],
 ['22128',
  'Patriots (The Coming Collapse)',
  'James Wesley',
  ' Rawles',
  '3.63',
  '156384155X',
  '9781563841552',
  'eng',
  '342',
  '38',
  '4',
  '1/15/1999',
  'Huntington House Publishers'],
 ['34889',
  "Brown's Star Atlas: Showing All The Bright Stars With Full Instructions How To Find And Use Them For Navigational Purposes And Department Of Trade Examinations.",
  'Brown',
  ' Son & Ferguson',
  '0.00',
  '0851742718',


### A cursory review of the bad lines looks like the authors were split across two lines

In [21]:
lst_aut = []
for lst in lst_bad_lines:
    aut = lst[2] + lst[3]
    lst_aut.append(aut)

In [22]:
lst_aut

['Sam Bass Warner Jr./Sam B. Warner',
 'David E. Smith (Turgon of TheOneRing.net one of the founding members of this Tolkien website)/Verlyn Flieger/Turgon (=David E. Smith)',
 'James Wesley Rawles',
 'Brown Son & Ferguson']

In [23]:
temp = lst_aut[3].split()
temp

['Brown', 'Son', '&', 'Ferguson']

In [24]:
temp = [elm for elm in temp if elm != '&']
temp

['Brown', 'Son', 'Ferguson']

In [25]:
temp = '/'.join(temp)
temp

'Brown/Son/Ferguson'

In [26]:
lst_aut.remove(lst_aut[3])
lst_aut.append(temp)
lst_aut

['Sam Bass Warner Jr./Sam B. Warner',
 'David E. Smith (Turgon of TheOneRing.net one of the founding members of this Tolkien website)/Verlyn Flieger/Turgon (=David E. Smith)',
 'James Wesley Rawles',
 'Brown/Son/Ferguson']

In [27]:
for i, lst in enumerate(lst_bad_lines):
    lst.remove(lst[2])
    lst.remove(lst[2])
    lst.insert(2, lst_aut[i])

lst_bad_lines

[['12224',
  'Streetcar Suburbs: The Process of Growth in Boston  1870-1900',
  'Sam Bass Warner Jr./Sam B. Warner',
  '3.58',
  '0674842111',
  '9780674842113',
  'en-US',
  '236',
  '61',
  '6',
  '4/20/2004',
  'Harvard University Press'],
 ['16914',
  "The Tolkien Fan's Medieval Reader",
  'David E. Smith (Turgon of TheOneRing.net one of the founding members of this Tolkien website)/Verlyn Flieger/Turgon (=David E. Smith)',
  '3.58',
  '1593600119',
  '9781593600112',
  'eng',
  '400',
  '26',
  '4',
  '4/6/2004',
  'Cold Spring Press'],
 ['22128',
  'Patriots (The Coming Collapse)',
  'James Wesley Rawles',
  '3.63',
  '156384155X',
  '9781563841552',
  'eng',
  '342',
  '38',
  '4',
  '1/15/1999',
  'Huntington House Publishers'],
 ['34889',
  "Brown's Star Atlas: Showing All The Bright Stars With Full Instructions How To Find And Use Them For Navigational Purposes And Department Of Trade Examinations.",
  'Brown/Son/Ferguson',
  '0.00',
  '0851742718',
  '9780851742717',
  'eng'

In [28]:
fixed_lines_dict = {}

In [29]:
for elm in lst_header:
    fixed_lines_dict[elm]= []

In [30]:
for lst in lst_bad_lines:
    for i, elm in enumerate(lst):
        key = lst_header[i]
        fixed_lines_dict[key].append(elm)

In [31]:
fixed_lines_dict

{'bookID': ['12224', '16914', '22128', '34889'],
 'title': ['Streetcar Suburbs: The Process of Growth in Boston  1870-1900',
  "The Tolkien Fan's Medieval Reader",
  'Patriots (The Coming Collapse)',
  "Brown's Star Atlas: Showing All The Bright Stars With Full Instructions How To Find And Use Them For Navigational Purposes And Department Of Trade Examinations."],
 'authors': ['Sam Bass Warner Jr./Sam B. Warner',
  'David E. Smith (Turgon of TheOneRing.net one of the founding members of this Tolkien website)/Verlyn Flieger/Turgon (=David E. Smith)',
  'James Wesley Rawles',
  'Brown/Son/Ferguson'],
 'average_rating': ['3.58', '3.58', '3.63', '0.00'],
 'isbn': ['0674842111', '1593600119', '156384155X', '0851742718'],
 'isbn13': ['9780674842113',
  '9781593600112',
  '9781563841552',
  '9780851742717'],
 'language_code': ['en-US', 'eng', 'eng', 'eng'],
 '  num_pages': ['236', '400', '342', '49'],
 'ratings_count': ['61', '26', '38', '0'],
 'text_reviews_count': ['6', '4', '4', '0'],
 'pu

In [32]:
df_fixed_lines = pd.DataFrame(fixed_lines_dict)
df_fixed_lines.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,12224,Streetcar Suburbs: The Process of Growth in Bo...,Sam Bass Warner Jr./Sam B. Warner,3.58,0674842111,9780674842113,en-US,236,61,6,4/20/2004,Harvard University Press
1,16914,The Tolkien Fan's Medieval Reader,David E. Smith (Turgon of TheOneRing.net one o...,3.58,1593600119,9781593600112,eng,400,26,4,4/6/2004,Cold Spring Press
2,22128,Patriots (The Coming Collapse),James Wesley Rawles,3.63,156384155X,9781563841552,eng,342,38,4,1/15/1999,Huntington House Publishers
3,34889,Brown's Star Atlas: Showing All The Bright Sta...,Brown/Son/Ferguson,0.0,0851742718,9780851742717,eng,49,0,0,5/1/1977,Brown Son & Ferguson Ltd.


In [33]:
df_new = pd.concat([df_books_ratings, df_fixed_lines])
df_new.head()

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.
2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,0439554896,9780439554893,eng,352,6333,244,11/1/2003,Scholastic
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.
4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,0439682584,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic


In [34]:
df_new.shape

(11127, 12)

In [35]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11127 entries, 0 to 3
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   bookID              11127 non-null  object
 1   title               11127 non-null  object
 2   authors             11127 non-null  object
 3   average_rating      11127 non-null  object
 4   isbn                11127 non-null  object
 5   isbn13              11127 non-null  object
 6   language_code       11127 non-null  object
 7     num_pages         11127 non-null  object
 8   ratings_count       11127 non-null  object
 9   text_reviews_count  11127 non-null  object
 10  publication_date    11127 non-null  object
 11  publisher           11127 non-null  object
dtypes: object(12)
memory usage: 1.1+ MB


In [36]:
working_books_path = folder + 'working_books.csv'

In [37]:
df_new.to_csv(working_books_path, encoding='utf-8', index=False)

In [38]:
os.listdir('data')

['uncleaned_books.csv',
 'uncleaned_books.csv_bad_lines.txt',
 'working_books.csv']