# Lecture 1031: Cleaning up and 'normalizing' string columns

In [2]:
import pandas as pd
import re # this is python's regex library

## Import data

Note that I'm importing `export.csv`, a CSV file I saved from Excel, solely to show you how to clean up strings. If you import the Excel file itself instead, the `AMOUNT` column is already typed as a float.

In [4]:
# NOTE(review): alternative load, intentionally left commented out. It would read
# 'newsom_contribs.csv' into `data_from_excel` using the same dtype / parse_dates
# options as the active `export.csv` load in the next cell.
# data_from_excel = pd.read_csv('newsom_contribs.csv',
#     dtype={
#         'ID NUMBER' : object
#     },
#     parse_dates=['TRANSACTION DATE', 'FILED DATE']
# )
# data_from_excel

In [3]:
# Load the CSV export. ID NUMBER is read as object (not inferred as a number),
# and the two date columns are parsed into datetimes.
data = pd.read_csv(
    'export.csv',
    dtype={'ID NUMBER': object},
    parse_dates=['TRANSACTION DATE', 'FILED DATE'],
)
data

Unnamed: 0,NAME OF CONTRIBUTOR,PAYMENT TYPE,CITY,STATE,ZIP,ID NUMBER,EMPLOYER,OCCUPATION,AMOUNT,TRANSACTION DATE,FILED DATE,TRANSACTION NUMBER
0,CALIFORNIA CHIROPRACTIC ASSOCIATION PAC,MONETARY,SACRAMENTO,CA,95814,742986,,,"($15,000.00)",2021-09-07,2022-05-20,2625145 - EXP2027
1,MARINA BILAVER,MONETARY,HOLLYWOOD,CA,90028,,,NOT EMPLOYED,$10.00,2021-07-01,2022-05-20,2625145 - IDT41416
2,CAMERON BLOOMER,MONETARY,NEW YORK,NY,10010,,BLOOMER BIOTECH,INVESTMENT ADVISOR,$100.00,2021-07-01,2022-05-20,2625145 - IDT41417
3,ALISON FLEMMING,MONETARY,LARKSPUR,CA,94904,,COOPER & MCCLOSKEY,INSURANCE BROKER,$150.00,2021-07-01,2022-05-20,2625145 - IDT41424
4,WILLIAM FOWKES,MONETARY,LA HONDA,CA,94020,,,NOT EMPLOYED,$25.00,2021-07-01,2022-05-20,2625145 - IDT41425
...,...,...,...,...,...,...,...,...,...,...,...,...
23381,"1-800 CONTACTS, INC.",MONETARY,DRAPER,UT,84020,,,,"$10,000.00",2021-12-22,2022-06-06,2644331 - INC3631
23382,"T-MOBILE USA, INC.",MONETARY,BELLEVUE,WA,98006,,,,"$5,000.00",2021-12-22,2022-06-06,2644331 - INC3632
23383,"GOOGLE, LLC AND AFFILIATED ENTITIES",MONETARY,MOUNTAIN VIEW,CA,94043,,,,"$32,400.00",2021-12-27,2022-06-06,2644331 - INC3634
23384,AIRLINES FOR AMERICA,MONETARY,WASHINGTON,DC,20004,,,,"$10,000.00",2021-12-28,2022-06-06,2644331 - INC3642


In [6]:
# Print a per-column summary (non-null counts and dtypes) to see how each column was typed.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23386 entries, 0 to 23385
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   NAME OF CONTRIBUTOR  23386 non-null  object        
 1   PAYMENT TYPE         23386 non-null  object        
 2   CITY                 23383 non-null  object        
 3   STATE                23368 non-null  object        
 4   ZIP                  23376 non-null  object        
 5   ID NUMBER            149 non-null    object        
 6   EMPLOYER             10234 non-null  object        
 7   OCCUPATION           23017 non-null  object        
 8   AMOUNT               23386 non-null  object        
 9   TRANSACTION DATE     23386 non-null  datetime64[ns]
 10  FILED DATE           23386 non-null  datetime64[ns]
 11  TRANSACTION NUMBER   23386 non-null  object        
dtypes: datetime64[ns](2), object(10)
memory usage: 2.1+ MB


POLL: What are you noticing about the `AMOUNT` column? [https://pollev.com/soooh](https://pollev.com/soooh)

## Clean up `AMOUNT` column

In [7]:
# Preview the first rows to look at the raw AMOUNT strings before cleaning.
data.head()

Unnamed: 0,NAME OF CONTRIBUTOR,PAYMENT TYPE,CITY,STATE,ZIP,ID NUMBER,EMPLOYER,OCCUPATION,AMOUNT,TRANSACTION DATE,FILED DATE,TRANSACTION NUMBER
0,CALIFORNIA CHIROPRACTIC ASSOCIATION PAC,MONETARY,SACRAMENTO,CA,95814,742986.0,,,"($15,000.00)",2021-09-07,2022-05-20,2625145 - EXP2027
1,MARINA BILAVER,MONETARY,HOLLYWOOD,CA,90028,,,NOT EMPLOYED,$10.00,2021-07-01,2022-05-20,2625145 - IDT41416
2,CAMERON BLOOMER,MONETARY,NEW YORK,NY,10010,,BLOOMER BIOTECH,INVESTMENT ADVISOR,$100.00,2021-07-01,2022-05-20,2625145 - IDT41417
3,ALISON FLEMMING,MONETARY,LARKSPUR,CA,94904,,COOPER & MCCLOSKEY,INSURANCE BROKER,$150.00,2021-07-01,2022-05-20,2625145 - IDT41424
4,WILLIAM FOWKES,MONETARY,LA HONDA,CA,94020,,,NOT EMPLOYED,$25.00,2021-07-01,2022-05-20,2625145 - IDT41425


In [8]:
# Strip the thousands separators and the dollar sign from AMOUNT.
#
# regex=False is passed explicitly so each pattern is treated as a literal string:
# '$' is a regex metacharacter (end-of-string anchor), and the default value of
# `regex` differs between pandas versions (older releases emit a FutureWarning
# for single-character patterns like this one).
#
# AMOUNT is still a string column after this cell; only the characters change.
data['AMOUNT'] = (
    data['AMOUNT']
    .str.replace(',', '', regex=False)  # replace commas with nothing
    .str.replace('$', '', regex=False)  # replace dollar sign with nothing
)

  data['AMOUNT'] = data['AMOUNT'].str.replace('$', '')


In [10]:
# Display the frame to check the AMOUNT column.
data

Unnamed: 0,NAME OF CONTRIBUTOR,PAYMENT TYPE,CITY,STATE,ZIP,ID NUMBER,EMPLOYER,OCCUPATION,AMOUNT,TRANSACTION DATE,FILED DATE,TRANSACTION NUMBER
0,CALIFORNIA CHIROPRACTIC ASSOCIATION PAC,MONETARY,SACRAMENTO,CA,95814,742986,,,(15000.00),2021-09-07,2022-05-20,2625145 - EXP2027
1,MARINA BILAVER,MONETARY,HOLLYWOOD,CA,90028,,,NOT EMPLOYED,10.00,2021-07-01,2022-05-20,2625145 - IDT41416
2,CAMERON BLOOMER,MONETARY,NEW YORK,NY,10010,,BLOOMER BIOTECH,INVESTMENT ADVISOR,100.00,2021-07-01,2022-05-20,2625145 - IDT41417
3,ALISON FLEMMING,MONETARY,LARKSPUR,CA,94904,,COOPER & MCCLOSKEY,INSURANCE BROKER,150.00,2021-07-01,2022-05-20,2625145 - IDT41424
4,WILLIAM FOWKES,MONETARY,LA HONDA,CA,94020,,,NOT EMPLOYED,25.00,2021-07-01,2022-05-20,2625145 - IDT41425
...,...,...,...,...,...,...,...,...,...,...,...,...
23381,"1-800 CONTACTS, INC.",MONETARY,DRAPER,UT,84020,,,,10000.00,2021-12-22,2022-06-06,2644331 - INC3631
23382,"T-MOBILE USA, INC.",MONETARY,BELLEVUE,WA,98006,,,,5000.00,2021-12-22,2022-06-06,2644331 - INC3632
23383,"GOOGLE, LLC AND AFFILIATED ENTITIES",MONETARY,MOUNTAIN VIEW,CA,94043,,,,32400.00,2021-12-27,2022-06-06,2644331 - INC3634
23384,AIRLINES FOR AMERICA,MONETARY,WASHINGTON,DC,20004,,,,10000.00,2021-12-28,2022-06-06,2644331 - INC3642


In [13]:
# Amounts wrapped in parentheses, e.g. "(15000.00)", are rewritten with a leading
# minus sign: the closing parenthesis is dropped and the opening one becomes '-'.
#
# regex=False is passed explicitly because '(' and ')' are regex metacharacters
# and the default value of `regex` differs between pandas versions (older
# releases emit a FutureWarning for single-character patterns like these).
#
# AMOUNT is still a string column after this cell.
data['AMOUNT'] = (
    data['AMOUNT']
    .str.replace(')', '', regex=False)   # replace right parenthesis with nothing
    .str.replace('(', '-', regex=False)  # replace left parenthesis with negative!
)

  data['AMOUNT'] = data['AMOUNT'].str.replace(')', '')
  data['AMOUNT'] = data['AMOUNT'].str.replace('(', '-')


In [14]:
# Display the frame to check the AMOUNT column.
data

Unnamed: 0,NAME OF CONTRIBUTOR,PAYMENT TYPE,CITY,STATE,ZIP,ID NUMBER,EMPLOYER,OCCUPATION,AMOUNT,TRANSACTION DATE,FILED DATE,TRANSACTION NUMBER
0,CALIFORNIA CHIROPRACTIC ASSOCIATION PAC,MONETARY,SACRAMENTO,CA,95814,742986,,,-15000.00,2021-09-07,2022-05-20,2625145 - EXP2027
1,MARINA BILAVER,MONETARY,HOLLYWOOD,CA,90028,,,NOT EMPLOYED,10.00,2021-07-01,2022-05-20,2625145 - IDT41416
2,CAMERON BLOOMER,MONETARY,NEW YORK,NY,10010,,BLOOMER BIOTECH,INVESTMENT ADVISOR,100.00,2021-07-01,2022-05-20,2625145 - IDT41417
3,ALISON FLEMMING,MONETARY,LARKSPUR,CA,94904,,COOPER & MCCLOSKEY,INSURANCE BROKER,150.00,2021-07-01,2022-05-20,2625145 - IDT41424
4,WILLIAM FOWKES,MONETARY,LA HONDA,CA,94020,,,NOT EMPLOYED,25.00,2021-07-01,2022-05-20,2625145 - IDT41425
...,...,...,...,...,...,...,...,...,...,...,...,...
23381,"1-800 CONTACTS, INC.",MONETARY,DRAPER,UT,84020,,,,10000.00,2021-12-22,2022-06-06,2644331 - INC3631
23382,"T-MOBILE USA, INC.",MONETARY,BELLEVUE,WA,98006,,,,5000.00,2021-12-22,2022-06-06,2644331 - INC3632
23383,"GOOGLE, LLC AND AFFILIATED ENTITIES",MONETARY,MOUNTAIN VIEW,CA,94043,,,,32400.00,2021-12-27,2022-06-06,2644331 - INC3634
23384,AIRLINES FOR AMERICA,MONETARY,WASHINGTON,DC,20004,,,,10000.00,2021-12-28,2022-06-06,2644331 - INC3642


## String cleaning

But first, let's talk about regex, or regular expressions.

We'll go through some of the lessons here: https://regexone.com/

## String normalizing

The following code collapses each run of consecutive spaces into a single space (the pattern `[ ]+` matches space characters only, not tabs or newlines):

```python
df['column_name'] = df['column_name'].str.replace(r'[ ]+', ' ', regex=True)
```

Remove leading and trailing whitespace:

```python
df['column_name'] = df['column_name'].str.strip()
```

Uppercase a column (you don't have to do this for this particular dataset):

```python
df['column_name'] = df['column_name'].str.upper()
```


POLL: If I'm trying to get the count and amount of corporate contributions to Newsom, how would I do this?
[https://pollev.com/soooh](https://pollev.com/soooh)