In [103]:
import pandas as pd
import numpy as np
from pathlib import Path

##### The Pandas Series Object

In [104]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
data.values
data.index
print(data[1:3])

1    0.50
2    0.75
dtype: float64


In [105]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
data['c']

0.75

In [106]:
population_dict = {'California': 38332521, 'Texas': 26448193, 'New York': 19651127, 'Florida': 19552860, 'Illinois': 12882135}
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

##### The Pandas DataFrame Object

In [107]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297, 'Florida': 170312, 'Illinois': 149995}

area = pd.Series(area_dict)
area

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

In [108]:
states = pd.DataFrame({'population': population, 'area' : area})
print(states)
states.index

            population    area
California    38332521  423967
Texas         26448193  695662
New York      19651127  141297
Florida       19552860  170312
Illinois      12882135  149995


Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [109]:
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [110]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])
#If some keys are missing pandas will fill them with NaN

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [111]:
pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.233344,0.853508
b,0.617936,0.180814
c,0.195677,0.837516


In [112]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662, 'California': 423967})
population = pd.Series({'California': 38332521, 'Texas': 26448193, 'New York': 19651127})
pd.DataFrame(population / area)

Unnamed: 0,0
Alaska,
California,90.413926
New York,
Texas,38.01874


##### Creating DFs from CSV File

In [113]:
path = Path('.', 'data', 'AutoDataset.csv')

df = pd.read_csv(path)
df.head()

#From here we can conclude that only the columns 'Row ID" and 'Sales' are numeric
df.dtypes

#This is descriptive statistics
df.describe()

Unnamed: 0,Row ID,Sales
count,12.0,12.0
mean,4071.5,109.63
std,2721.099427,148.25714
min,2314.0,2.97
25%,2316.75,13.635
50%,2587.5,34.25
75%,4490.75,152.205
max,9261.0,416.32


In [114]:
header_names = pd.read_csv(path, nrows=0).columns # load 0 rows, basically just load the headers
df = pd.read_csv(path, skiprows=4, names=header_names) # skip 4 rows (1 for the header, and 3 from the actual data set), and set names param = header_names (names: Sequence of column labels to apply.)
df.head()

Unnamed: 0,Row ID,Order ID,Ship Date,Customer Name,Country,City,State,Category,Sub-Category,Product Name,Sales
0,2317,CA-2017-122035,7/25/2017,Elizabeth Moffitt,United States,Sioux Falls,South Dakota,Furniture,Chairs,Novimex Fabric Task Chair,182.94
1,2318,CA-2017-122035,7/25/2017,Elizabeth Moffitt,United States,Sioux Falls,South Dakota,Office Supplies,Binders,Cardinal Slant-D Ring Binders,60.83
2,2319,CA-2017-122035,7/25/2017,Elizabeth Moffitt,United States,Sioux Falls,South Dakota,Technology,Accessories,Logitech G35 7.1-Channel Surround Sound Headset,389.97
3,2856,CA-2017-169810,7/31/2017,Raymond Buch,United States,Sioux Falls,South Dakota,Office Supplies,Labels,Avery 498,20.23
4,3270,CA-2014-115980,7/19/2014,Victoria Wilson,United States,Sioux Falls,South Dakota,Technology,Accessories,Maxell 4.7GB DVD-R 5/Pack,2.97


In [115]:
# This will parse Ship Date column as date
df = pd.read_csv(path, parse_dates=['Ship Date'])
df.head()

df.dtypes

Row ID                    int64
Order ID                 object
Ship Date        datetime64[ns]
Customer Name            object
Country                  object
City                     object
State                    object
Category                 object
Sub-Category             object
Product Name             object
Sales                   float64
dtype: object

In [116]:
# This will rename some column in dataset

df = pd.read_csv(path)
df.rename(columns={'Sales': 'TotalSales'}, inplace=True)
df.head()

Unnamed: 0,Row ID,Order ID,Ship Date,Customer Name,Country,City,State,Category,Sub-Category,Product Name,TotalSales
0,2314,CA-2017-122035,7/25/2017,Elizabeth Moffitt,United States,Sioux Falls,South Dakota,Office Supplies,Labels,Avery 486,14.62
1,2315,CA-2017-122035,7/25/2017,Elizabeth Moffitt,United States,Sioux Falls,South Dakota,Office Supplies,Appliances,"1.7 Cubic Foot Compact ""Cube"" Office Refrigera...",416.32
2,2316,CA-2017-122035,7/25/2017,Elizabeth Moffitt,United States,Sioux Falls,South Dakota,Office Supplies,Binders,Avery Printable Repositionable Plastic Tabs,43.0
3,2317,CA-2017-122035,7/25/2017,Elizabeth Moffitt,United States,Sioux Falls,South Dakota,Furniture,Chairs,Novimex Fabric Task Chair,182.94
4,2318,CA-2017-122035,7/25/2017,Elizabeth Moffitt,United States,Sioux Falls,South Dakota,Office Supplies,Binders,Cardinal Slant-D Ring Binders,60.83


##### Creating DFs from Excel File

In [117]:
path = Path('.', 'data', 'Order.xlsx')
df = pd.read_excel(path, sheet_name='Orders')
df.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [118]:
df.rename(columns={'City': 'State', 'Customer Name': 'Customer'}, inplace=True)
df.rename(columns={'Product Name': 'Product'}, inplace=True)
df.head()


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer,Segment,Country,State,...,Postal Code,Region,Product ID,Category,Sub-Category,Product,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [119]:
df['Category'].unique()
pd.DataFrame(df['Region'].unique(), columns=['US'])


Unnamed: 0,US
0,South
1,West
2,Central
3,East


In [120]:
#Dropping the column State
df_1 = df.drop(['State'], axis='columns')
df_1.head()

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer,Segment,Country,Postal Code,Region,Product ID,Category,Sub-Category,Product,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


#### Exercise with Pandas

In [123]:
path_loan = Path('.', 'data', 'LoanData.csv')
df = pd.read_csv(path_loan, sep = ',')
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,policy_code,application_type,annual_inc_joint,verification_status_joint,acc_now_delinq,tot_cur_bal,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.0,36 months,10.65%,162.87,B,B2,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0
1,1077430,1314167,2500,2500,2500.0,60 months,15.27%,59.83,C,C4,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0
2,1077175,1313524,2400,2400,2400.0,36 months,15.96%,84.33,C,C5,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0
3,1076863,1277178,10000,10000,10000.0,36 months,13.49%,339.31,C,C1,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0
4,1075358,1311748,3000,3000,3000.0,60 months,12.69%,67.79,B,B5,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0


In [124]:
#How many member_id appear more than once?

df['member_id'].value_counts()

member_id
1296599    1
731393     1
731544     1
729629     1
731390     1
          ..
989001     1
988993     1
988959     1
988954     1
86999      1
Name: count, Length: 39717, dtype: int64

In [126]:
#Count how many unique sub_grades there are

df['sub_grade'].nunique()

35

In [128]:
#List the unique sub_grades in a new data frame with column named 'sub_grade_index'
pd.DataFrame(df['sub_grade'].unique(), columns=['sub_grade_index'])

Unnamed: 0,sub_grade_index
0,B2
1,C4
2,C5
3,C1
4,B5
5,A4
6,E1
7,F2
8,C3
9,B1


In [129]:
#Sort the dataset by loan_amnt ASC and installment DESC

df.sort_values(by=['loan_amnt', 'installment'], ascending=[True, False])


Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,policy_code,application_type,annual_inc_joint,verification_status_joint,acc_now_delinq,tot_cur_bal,chargeoff_within_12_mths,delinq_amnt,pub_rec_bankruptcies,tax_liens
39397,211133,210644,500,500,475.00000,36 months,11.41%,16.47,C,C3,...,1,INDIVIDUAL,,,0,,0.0,0,,0.0
39275,242695,242682,500,500,500.00000,36 months,10.71%,16.31,B,B5,...,1,INDIVIDUAL,,,0,,0.0,0,,0.0
39373,216698,174214,500,500,500.00000,36 months,10.46%,16.25,B,B5,...,1,INDIVIDUAL,,,0,,0.0,0,,0.0
38863,312505,312443,500,500,450.00000,36 months,9.76%,16.08,B,B2,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0
39544,164346,163835,500,500,400.00000,36 months,8.07%,15.69,A,A4,...,1,INDIVIDUAL,,,0,,0.0,0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9186,836299,1046319,35000,22100,21825.00000,60 months,11.49%,485.93,B,B4,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0
15203,752865,952485,35000,21275,21225.00000,60 months,12.99%,483.97,C,C1,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0
9223,853254,1034194,35000,21850,21825.00000,60 months,11.49%,480.43,B,B4,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0
702,1056057,1287630,35000,21275,21250.00000,60 months,11.71%,470.14,B,B3,...,1,INDIVIDUAL,,,0,,0.0,0,0.0,0.0


#### Working with Large Datasets

In [132]:
path = Path('.', 'data', 'AirQuality.csv')
chunk_size = 1000

chunks = 0
sum_of_air_quality = 0

for chunk in pd.read_csv(path, chunksize=chunk_size):
    chunk_sum = chunk['Data Value'].sum()
    sum_of_air_quality += chunk_sum
    chunks += 1
    
print(chunks)
print(sum_of_air_quality)

19
386250.80000000005
