# Reading Files

In [59]:
import pandas as pd
from openpyxl.workbook import Workbook

In [60]:
df_excel = pd.read_excel('data/regions.xlsx')
df_csv = pd.read_csv('data/Names.csv', header=None)
df_txt = pd.read_csv('data/data.txt', delimiter='\t')

In [61]:
df_excel

Unnamed: 0,Region,Units,Sales,Export
0,South,54,332,100
1,North,20,110,50
2,East,36,224,85
3,West,60,400,110
4,West,50,226,65
5,North,84,470,150


In [62]:
df_txt

Unnamed: 0,ID,EGF_Baseline,EGF_Stimulus
0,FBgn0029994,-1.25,-0.27
1,FBgn0037191,-1.05,0.78
2,FBgn0036810,2.08,1.34
3,FBgn0033320,1.15,0.45
4,FBgn0051156,-1.77,-0.76
...,...,...,...
11761,FBgn0026136,0.57,1.23
11762,FBgn0037356,1.14,-0.95
11763,FBgn0038214,-1.86,-0.67
11764,FBgn0042110,1.49,0.43


In [63]:
df_csv

Unnamed: 0,0,1,2,3,4,5,6
0,John,Doe,120 jefferson st.,Riverside,NJ,8074,45000
1,Jack,McGinnis,220 hobo Av.,Phila,PA,9119,18000
2,"John ""Da Man""",Repici,120 jefferson st.,Riverside,NJ,8075,120000
3,Stephen,Tyler,"7452 Terrance ""At the Plaza"" road",SomeTown,SD,91234,90000
4,,Blankman,,SomeTown,SD,298,30000
5,"Joan ""Danger"", Anne",Jet,"9th, at Terrace plc",Desert City,CO,123,68000


In [64]:
df_csv.columns = ['First Name', 'Last Name', 'Address', 'City', 'State', 'Area Code', 'Income']

In [65]:
df_csv

Unnamed: 0,First Name,Last Name,Address,City,State,Area Code,Income
0,John,Doe,120 jefferson st.,Riverside,NJ,8074,45000
1,Jack,McGinnis,220 hobo Av.,Phila,PA,9119,18000
2,"John ""Da Man""",Repici,120 jefferson st.,Riverside,NJ,8075,120000
3,Stephen,Tyler,"7452 Terrance ""At the Plaza"" road",SomeTown,SD,91234,90000
4,,Blankman,,SomeTown,SD,298,30000
5,"Joan ""Danger"", Anne",Jet,"9th, at Terrace plc",Desert City,CO,123,68000


In [66]:
df_csv.to_excel('output/modified.xlsx')

# Viewing Files

In [67]:
df_csv.columns

Index(['First Name', 'Last Name', 'Address', 'City', 'State', 'Area Code',
       'Income'],
      dtype='object')

In [68]:
df_csv['Last Name']

0         Doe
1    McGinnis
2      Repici
3       Tyler
4    Blankman
5         Jet
Name: Last Name, dtype: object

In [69]:
df_csv[['State', 'Area Code']]

Unnamed: 0,State,Area Code
0,NJ,8074
1,PA,9119
2,NJ,8075
3,SD,91234
4,SD,298
5,CO,123


In [70]:
df_csv['First Name'][0:3]

0             John
1             Jack
2    John "Da Man"
Name: First Name, dtype: object

In [71]:
df_csv.iloc[1]

First Name            Jack
Last Name         McGinnis
Address       220 hobo Av.
City                 Phila
State                   PA
Area Code             9119
Income               18000
Name: 1, dtype: object

In [72]:
df_csv.iloc[2, 1]

'Repici'

In [73]:
wanted_values = df_csv[['First Name', 'Last Name', 'State']]
wanted_values.to_excel('output/state_location.xlsx', index=None)

# Filter and Sort

In [74]:
df_csv

Unnamed: 0,First Name,Last Name,Address,City,State,Area Code,Income
0,John,Doe,120 jefferson st.,Riverside,NJ,8074,45000
1,Jack,McGinnis,220 hobo Av.,Phila,PA,9119,18000
2,"John ""Da Man""",Repici,120 jefferson st.,Riverside,NJ,8075,120000
3,Stephen,Tyler,"7452 Terrance ""At the Plaza"" road",SomeTown,SD,91234,90000
4,,Blankman,,SomeTown,SD,298,30000
5,"Joan ""Danger"", Anne",Jet,"9th, at Terrace plc",Desert City,CO,123,68000


In [75]:
df_csv.loc[df_csv['City'] == 'Riverside']

Unnamed: 0,First Name,Last Name,Address,City,State,Area Code,Income
0,John,Doe,120 jefferson st.,Riverside,NJ,8074,45000
2,"John ""Da Man""",Repici,120 jefferson st.,Riverside,NJ,8075,120000


In [76]:
df_csv.loc[(df_csv['City'] == 'Riverside') & (df_csv['First Name'] == 'John')]

Unnamed: 0,First Name,Last Name,Address,City,State,Area Code,Income
0,John,Doe,120 jefferson st.,Riverside,NJ,8074,45000


In [77]:
df_csv['Tax %'] = df_csv['Income'].apply(lambda x: .15 if 10000 < x < 40000 else .2 if 40000 < x < 80000 else .25)
df_csv

Unnamed: 0,First Name,Last Name,Address,City,State,Area Code,Income,Tax %
0,John,Doe,120 jefferson st.,Riverside,NJ,8074,45000,0.2
1,Jack,McGinnis,220 hobo Av.,Phila,PA,9119,18000,0.15
2,"John ""Da Man""",Repici,120 jefferson st.,Riverside,NJ,8075,120000,0.25
3,Stephen,Tyler,"7452 Terrance ""At the Plaza"" road",SomeTown,SD,91234,90000,0.25
4,,Blankman,,SomeTown,SD,298,30000,0.15
5,"Joan ""Danger"", Anne",Jet,"9th, at Terrace plc",Desert City,CO,123,68000,0.2


In [78]:
df_csv['Taxes Owed'] = df_csv['Income'] * df_csv['Tax %']
df_csv['Taxes Owed']

0     9000.0
1     2700.0
2    30000.0
3    22500.0
4     4500.0
5    13600.0
Name: Taxes Owed, dtype: float64

In [79]:
to_drop = ['Area Code', 'First Name', 'Address']
df_csv.drop(columns=to_drop, inplace=True)
df_csv

Unnamed: 0,Last Name,City,State,Income,Tax %,Taxes Owed
0,Doe,Riverside,NJ,45000,0.2,9000.0
1,McGinnis,Phila,PA,18000,0.15,2700.0
2,Repici,Riverside,NJ,120000,0.25,30000.0
3,Tyler,SomeTown,SD,90000,0.25,22500.0
4,Blankman,SomeTown,SD,30000,0.15,4500.0
5,Jet,Desert City,CO,68000,0.2,13600.0


In [80]:
df_csv['Test Col'] = False

In [81]:
df_csv.loc[df_csv['Income'] < 60000, 'Test Col'] = True
df_csv

Unnamed: 0,Last Name,City,State,Income,Tax %,Taxes Owed,Test Col
0,Doe,Riverside,NJ,45000,0.2,9000.0,True
1,McGinnis,Phila,PA,18000,0.15,2700.0,True
2,Repici,Riverside,NJ,120000,0.25,30000.0,False
3,Tyler,SomeTown,SD,90000,0.25,22500.0,False
4,Blankman,SomeTown,SD,30000,0.15,4500.0,True
5,Jet,Desert City,CO,68000,0.2,13600.0,False


In [82]:
df_csv.groupby(['Test Col']).mean()

Unnamed: 0_level_0,Income,Tax %,Taxes Owed
Test Col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,92666.666667,0.233333,22033.333333
True,31000.0,0.166667,5400.0


In [83]:
df_csv.groupby(['Test Col']).mean().sort_values('Income')

Unnamed: 0_level_0,Income,Tax %,Taxes Owed
Test Col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
True,31000.0,0.166667,5400.0
False,92666.666667,0.233333,22033.333333


# Cleaning

In [84]:
import numpy as np

In [85]:
df = pd.read_csv('data/Names.csv', header=None)
df.columns = ['First Name', 'Last Name', 'Address', 'City', 'State', 'Area Code', 'Income']

In [86]:
df.drop(columns='Address', inplace=True)
df = df.set_index('Area Code')

In [87]:
df

Unnamed: 0_level_0,First Name,Last Name,City,State,Income
Area Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8074,John,Doe,Riverside,NJ,45000
9119,Jack,McGinnis,Phila,PA,18000
8075,"John ""Da Man""",Repici,Riverside,NJ,120000
91234,Stephen,Tyler,SomeTown,SD,90000
298,,Blankman,SomeTown,SD,30000
123,"Joan ""Danger"", Anne",Jet,Desert City,CO,68000


In [88]:
df.loc[8074]

First Name         John
Last Name           Doe
City          Riverside
State                NJ
Income            45000
Name: 8074, dtype: object

In [89]:
df.iloc[0]

First Name         John
Last Name           Doe
City          Riverside
State                NJ
Income            45000
Name: 8074, dtype: object

In [90]:
df.loc[8074:, 'First Name']

Area Code
8074                    John
9119                    Jack
8075           John "Da Man"
91234                Stephen
298                      NaN
123      Joan "Danger", Anne
Name: First Name, dtype: object

In [91]:
df['First Name'].str.split(expand=True)

Unnamed: 0_level_0,0,1,2
Area Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8074,John,,
9119,Jack,,
8075,John,"""Da","Man"""
91234,Stephen,,
298,,,
123,Joan,"""Danger"",",Anne


In [95]:
df['First Name'] = df['First Name'].str.split(expand=True)[0]
df

Unnamed: 0_level_0,First Name,Last Name,City,State,Income
Area Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8074,John,Doe,Riverside,NJ,45000
9119,Jack,McGinnis,Phila,PA,18000
8075,John,Repici,Riverside,NJ,120000
91234,Stephen,Tyler,SomeTown,SD,90000
298,,Blankman,SomeTown,SD,30000
123,Joan,Jet,Desert City,CO,68000


In [96]:
df = df.replace(np.nan, 'N/A', regex=True)
df

Unnamed: 0_level_0,First Name,Last Name,City,State,Income
Area Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8074,John,Doe,Riverside,NJ,45000
9119,Jack,McGinnis,Phila,PA,18000
8075,John,Repici,Riverside,NJ,120000
91234,Stephen,Tyler,SomeTown,SD,90000
298,,Blankman,SomeTown,SD,30000
123,Joan,Jet,Desert City,CO,68000


In [97]:
df.to_excel('output/modified_1.xlsx')