In [1]:
import pandas as pd
from datetime import datetime as dt

In [2]:
# Load the raw household power readings; the file marks missing values with '?'
df = pd.read_csv('../data/03-Household-Power-Consumption/household_power_consumption.txt',
                 sep=';', na_values='?')

# Build the cleaned 2010 frame in one vectorised pass:
#   1. keep only rows whose Date string (d/m/yyyy) ends in the year 2010
#   2. remove every row that has a null in any column
#   3. combine Date and Time into a single datetime column, parsed once
#   4. drop the original Date and Time string columns
#   5. use the parsed datetimes as the index (named 'datetime') while also
#      keeping them available as the 'Datetime' column
df = (
    df[df['Date'].str.endswith('/2010', na=False)]
    .dropna()
    .assign(Datetime=lambda frame: pd.to_datetime(frame['Date'] + ' ' + frame['Time'],
                                                  format='%d/%m/%Y %H:%M:%S'))
    .drop(labels=['Date', 'Time'], axis=1)
    .set_index('Datetime', drop=False)
    .rename_axis('datetime')
)
df.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Datetime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-01 00:00:00,1.79,0.236,240.65,7.4,0.0,0.0,18.0,2010-01-01 00:00:00
2010-01-01 00:01:00,1.78,0.234,240.07,7.4,0.0,0.0,18.0,2010-01-01 00:01:00
2010-01-01 00:02:00,1.78,0.234,240.15,7.4,0.0,0.0,19.0,2010-01-01 00:02:00
2010-01-01 00:03:00,1.746,0.186,240.26,7.2,0.0,0.0,18.0,2010-01-01 00:03:00
2010-01-01 00:04:00,1.686,0.102,240.12,7.0,0.0,0.0,18.0,2010-01-01 00:04:00


# **Filtering DataFrames**

In [3]:
# Column-wise reduction: a column is True only if every one of its entries is truthy
df.all(axis=0)

Global_active_power       True
Global_reactive_power    False
Voltage                   True
Global_intensity          True
Sub_metering_1           False
Sub_metering_2           False
Sub_metering_3           False
Datetime                  True
dtype: bool

Columns marked `True` are those that contain no zero entries.

## **Selecting columns with no zero entries**

In [5]:
# Preview the first rows, keeping only the columns whose entries are all truthy
df.head().loc[:, df.all()]

Unnamed: 0_level_0,Global_active_power,Voltage,Global_intensity,Datetime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-01 00:00:00,1.79,240.65,7.4,2010-01-01 00:00:00
2010-01-01 00:01:00,1.78,240.07,7.4,2010-01-01 00:01:00
2010-01-01 00:02:00,1.78,240.15,7.4,2010-01-01 00:02:00
2010-01-01 00:03:00,1.746,240.26,7.2,2010-01-01 00:03:00
2010-01-01 00:04:00,1.686,240.12,7.0,2010-01-01 00:04:00


## **Selecting columns with any NaN entries**

In [8]:
# Preview the first rows, keeping only the columns that contain at least one NaN
df.head().loc[:, df.isna().any()]

2010-01-01 00:00:00
2010-01-01 00:01:00
2010-01-01 00:02:00
2010-01-01 00:03:00
2010-01-01 00:04:00


## **Selecting columns without NaNs**

In [10]:
# Preview the first rows, keeping only the columns in which every entry is non-null
df.head().loc[:, df.notna().all()]

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3,Datetime
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-01 00:00:00,1.79,0.236,240.65,7.4,0.0,0.0,18.0,2010-01-01 00:00:00
2010-01-01 00:01:00,1.78,0.234,240.07,7.4,0.0,0.0,18.0,2010-01-01 00:01:00
2010-01-01 00:02:00,1.78,0.234,240.15,7.4,0.0,0.0,19.0,2010-01-01 00:02:00
2010-01-01 00:03:00,1.746,0.186,240.26,7.2,0.0,0.0,18.0,2010-01-01 00:03:00
2010-01-01 00:04:00,1.686,0.102,240.12,7.0,0.0,0.0,18.0,2010-01-01 00:04:00


Since none of the columns have any NaN values, all columns were selected by the
`.notnull().all()` method.

# **Hierarchical indexing**

In [13]:
# Read the house-price training set into a DataFrame and preview its first rows
housing = pd.read_csv(filepath_or_buffer='../data/01-House-Price/train.csv')
housing.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [21]:
# Index the sales by (year sold, month sold), then keep just the three columns of interest
house = (
    housing
    .set_index(['YrSold', 'MoSold'])
    .loc[:, ['LotShape', 'LandSlope', 'SalePrice']]
)
house.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,LotShape,LandSlope,SalePrice
YrSold,MoSold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008,2,Reg,Gtl,208500
2007,5,Reg,Gtl,181500
2008,9,IR1,Gtl,223500
2006,2,IR1,Gtl,140000
2008,12,IR1,Gtl,250000


To make the index appear hierarchical, use the `sort_index` method.

In [22]:
# Order the rows by the (YrSold, MoSold) index, ascending
house = house.sort_index(axis=0, ascending=True)
house.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,LotShape,LandSlope,SalePrice
YrSold,MoSold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006,1,Reg,Gtl,260000
2006,1,IR1,Gtl,228000
2006,1,IR1,Gtl,205000
2006,1,IR1,Gtl,172400
2006,1,Reg,Gtl,145000


In [26]:
# Show the single `name` attribute of the MultiIndex, then its per-level `names`
print(house.index.name, house.index.names, sep='\n')

None
['YrSold', 'MoSold']


### Access houses sold in first month of year 2006

In [28]:
# All columns for the rows indexed by YrSold == 2006 and MoSold == 1
house.loc[pd.IndexSlice[2006, 1], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,LotShape,LandSlope,SalePrice
YrSold,MoSold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006,1,Reg,Gtl,260000
2006,1,IR1,Gtl,228000
2006,1,IR1,Gtl,205000
2006,1,IR1,Gtl,172400
2006,1,Reg,Gtl,145000
2006,1,IR2,Gtl,423000
2006,1,Reg,Gtl,181000
2006,1,IR1,Gtl,136500
2006,1,Reg,Gtl,155000
2006,1,Reg,Gtl,105000


### Access sale prices of houses sold in first month of year 2006

In [29]:
# SalePrice only, for the rows indexed by YrSold == 2006 and MoSold == 1
house.loc[pd.IndexSlice[2006, 1], 'SalePrice']

YrSold  MoSold
2006    1         260000
        1         228000
        1         205000
        1         172400
        1         145000
        1         423000
        1         181000
        1         136500
        1         155000
        1         105000
Name: SalePrice, dtype: int64

## **Fancy indexing (outermost indexing)**

### Access sale prices of houses sold in the first month for year 2006 and 2007

In [30]:
# A list on the outer level picks several years; the inner level stays fixed at month 1
house.loc[pd.IndexSlice[[2006, 2007], 1], 'SalePrice']

YrSold  MoSold
2006    1         260000
        1         228000
        1         205000
        1         172400
        1         145000
        1         423000
        1         181000
        1         136500
        1         155000
        1         105000
2007    1         127000
        1         122000
        1         228500
        1          39300
        1         755000
        1         171000
        1         132250
        1          75000
        1          86000
        1         143000
        1         178000
        1         122000
        1         203000
Name: SalePrice, dtype: int64

### Access sale prices of houses sold in 1st and last month of year 2007

In [31]:
# A list on the inner level picks several months (1 and 12) within year 2007
house.loc[pd.IndexSlice[2007, [1, 12]], 'SalePrice']

YrSold  MoSold
2007    1         127000
        1         122000
        1         228500
        1          39300
        1         755000
        1         171000
        1         132250
        1          75000
        1          86000
        1         143000
        1         178000
        1         122000
        1         203000
        12        144000
        12        315000
        12        318000
        12        235000
        12        270000
        12         87000
        12        318061
        12        340000
        12        170000
        12        139000
        12        141000
        12        147000
        12         84500
        12        179000
        12        265900
        12        227000
        12        239000
        12        240000
Name: SalePrice, dtype: int64

### Access sale prices of houses sold in the 1st month of every available year

In [35]:
# `:` on the outer level selects every year; the inner level is fixed at month 1
house.loc[pd.IndexSlice[:, 1], 'SalePrice']

YrSold  MoSold
2006    1         260000
        1         228000
        1         205000
        1         172400
        1         145000
        1         423000
        1         181000
        1         136500
        1         155000
        1         105000
2007    1         127000
        1         122000
        1         228500
        1          39300
        1         755000
        1         171000
        1         132250
        1          75000
        1          86000
        1         143000
        1         178000
        1         122000
        1         203000
2008    1         118000
        1         179900
        1         124900
        1         127000
        1         372402
        1         149000
        1         107500
        1         328900
        1         108959
        1         214000
        1         160000
        1          55000
        1         275000
2009    1         181000
        1         274900
        1         131000
        1 

## **Stacking and unstacking DataFrames**

In [39]:
# Discard the (YrSold, MoSold) index entirely and fall back to a default integer index
pivot_ex_df = house.reset_index(level=None, drop=True)
pivot_ex_df.head(n=5)

Unnamed: 0,LotShape,LandSlope,SalePrice
0,Reg,Gtl,260000
1,IR1,Gtl,228000
2,IR1,Gtl,205000
3,IR1,Gtl,172400
4,Reg,Gtl,145000


In [43]:
# Reminder of the frame with its (YrSold, MoSold) index before the levels are swapped
house.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,LotShape,LandSlope,SalePrice
YrSold,MoSold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006,1,Reg,Gtl,260000
2006,1,IR1,Gtl,228000
2006,1,IR1,Gtl,205000
2006,1,IR1,Gtl,172400
2006,1,Reg,Gtl,145000


In [45]:
# Exchange the two index levels so MoSold becomes the outer level and YrSold the inner one
house_swapped = house.swaplevel('YrSold', 'MoSold')
house_swapped.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,LotShape,LandSlope,SalePrice
MoSold,YrSold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2006,Reg,Gtl,260000
1,2006,IR1,Gtl,228000
1,2006,IR1,Gtl,205000
1,2006,IR1,Gtl,172400
1,2006,Reg,Gtl,145000


## **Pivot tables**

In [49]:
# First five rows of the frame that the pivot tables below are built from
house.iloc[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,LotShape,LandSlope,SalePrice
YrSold,MoSold,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2006,1,Reg,Gtl,260000
2006,1,IR1,Gtl,228000
2006,1,IR1,Gtl,205000
2006,1,IR1,Gtl,172400
2006,1,Reg,Gtl,145000


In [51]:
# Years as rows, months as columns; with no `values` given, the count is
# computed separately for every remaining column of `house`
house.pivot_table(
    index='YrSold',
    columns='MoSold',
    aggfunc='count',
)

Unnamed: 0_level_0,LandSlope,LandSlope,LandSlope,LandSlope,LandSlope,LandSlope,LandSlope,LandSlope,LandSlope,LandSlope,...,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice,SalePrice
MoSold,1,2,3,4,5,6,7,8,9,10,...,3,4,5,6,7,8,9,10,11,12
YrSold,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2006,10.0,9.0,25.0,27.0,38.0,48.0,67.0,23.0,15.0,24.0,...,25.0,27.0,38.0,48.0,67.0,23.0,15.0,24.0,16.0,12.0
2007,13.0,8.0,23.0,23.0,43.0,59.0,51.0,40.0,11.0,16.0,...,23.0,23.0,43.0,59.0,51.0,40.0,11.0,16.0,24.0,18.0
2008,13.0,10.0,18.0,26.0,38.0,51.0,49.0,29.0,17.0,22.0,...,18.0,26.0,38.0,51.0,49.0,29.0,17.0,22.0,17.0,14.0
2009,12.0,10.0,19.0,26.0,37.0,59.0,61.0,30.0,20.0,27.0,...,19.0,26.0,37.0,59.0,61.0,30.0,20.0,27.0,22.0,15.0
2010,10.0,15.0,21.0,39.0,48.0,36.0,6.0,,,,...,21.0,39.0,48.0,36.0,6.0,,,,,


The above pivot table is a frequency table that shows the number of houses sold in each month of each year.

In [53]:
# Average SalePrice per (year, month); 'mean' is pivot_table's default aggregation,
# spelled out here so the reader does not have to know the default
house.pivot_table(
    values='SalePrice',
    index='YrSold',
    columns='MoSold',
    aggfunc='mean',
)

MoSold,1,2,3,4,5,6,7,8,9,10,11,12
YrSold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2006,201090.0,194322.222222,184982.2,174312.814815,158928.289474,172283.333333,183211.059701,196239.956522,223768.866667,172356.708333,213285.0,185545.0
2007,183234.615385,176301.75,176567.782609,170772.608696,169873.511628,179725.813559,195396.843137,183941.075,195533.818182,215428.8125,197709.333333,214414.5
2008,178504.692308,159370.0,178505.277778,159293.346154,188334.473684,169730.941176,174562.653061,196076.965517,169626.470588,166690.636364,210981.058824,175600.0
2009,189735.5,187450.0,171547.368421,181680.769231,164482.945946,183260.932203,197984.409836,165670.966667,196849.35,175206.592593,156381.818182,164014.533333
2010,163852.6,174823.333333,203181.285714,171344.025641,178422.25,181639.583333,121750.0,,,,,


# **Filtering and grouping with .map()**

In [59]:
# Summary statistics (count, mean, std, quartiles, min/max) of the sale prices
house.SalePrice.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

### Let's find the number of houses above and below 500k

In [64]:
# Label every sale by price band: strictly below 500000 -> 'Under 500k', otherwise 'Over 500k'
under500k = house['SalePrice'].lt(500000).map({True: 'Under 500k', False: 'Over 500k'})

In [65]:
# Number of sales falling in each price band
house['SalePrice'].groupby(under500k).count()

SalePrice
Over 500k        9
Under 500k    1451
Name: SalePrice, dtype: int64

In [67]:
# Keep only the lot-shape and land-slope columns
house_ShapeSlope = house.loc[:, ['LotShape', 'LandSlope']]

In [69]:
# Keep the first occurrence of each distinct (LotShape, LandSlope) combination
house_ShapeSlope[~house_ShapeSlope.duplicated()]

Unnamed: 0_level_0,Unnamed: 1_level_0,LotShape,LandSlope
YrSold,MoSold,Unnamed: 2_level_1,Unnamed: 3_level_1
2006,1,Reg,Gtl
2006,1,IR1,Gtl
2006,1,IR2,Gtl
2006,3,IR1,Mod
2006,4,Reg,Mod
2006,7,IR3,Gtl
2006,10,IR1,Sev
2006,12,Reg,Sev
2007,6,IR2,Sev
2007,6,IR2,Mod


In [70]:
# Highest sale price in the data
house['SalePrice'].max()

755000

In [71]:
# Index label, as a (YrSold, MoSold) tuple, of the row holding the highest sale price
house['SalePrice'].idxmax()

(2007, 1)

The highest-priced house was sold in the first month of 2007.