In [None]:
This notebook looks to show beginning steps to look into the data we have available.

Process:
Read data
.describe
check if min and max for numeric make sense
Check if any nulls in columns

# Imports

In [1]:
import os
os.chdir("../../coding_data/cookbook_data/")
os.getcwd()

import pandas as pd
import numpy as np

pd.set_option('max_columns', 4, 'max_rows', 10, 'max_colwidth', 12)

# Initial summary statistics

In [61]:
college = pd.read_csv('data/college.csv')

college.shape
college.info()
college.describe(include=[np.number]).T
college.describe(include=[np.object, pd.Categorical]).T # including object columns

# can include percentile in .describe method
college.describe(include=[np.number],
   percentiles=[.01, .05, .10, .25, .5,
                .75, .9, .95, .99]).T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

Unnamed: 0,count,mean,...,99%,max
HBCU,7164.0,0.014238,...,1.000000,1.0
MENONLY,7164.0,0.009213,...,0.000000,1.0
WOMENONLY,7164.0,0.005304,...,0.000000,1.0
RELAFFIL,7535.0,0.190975,...,1.000000,1.0
SATVRMID,1185.0,522.819409,...,730.000000,765.0
...,...,...,...,...,...
PPTUG_EF,6853.0,0.226639,...,0.946724,1.0
CURROPER,7535.0,0.923291,...,1.000000,1.0
PCTPELL,6849.0,0.530643,...,0.993908,1.0
PCTFLOAN,6849.0,0.522211,...,0.986368,1.0


Describe method for object columns where we have the following information:
* counts
* unique values
* most frequent value
* frequency of most frequent value

In [62]:
# just object columns

college.describe(include=object)

Unnamed: 0,INSTNM,CITY,...,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
count,7535,7535,...,6413,7503
unique,7535,2514,...,598,2038
top,Briarcli...,New York,...,PrivacyS...,PrivacyS...
freq,1,87,...,822,1510


# Data types and saving memory

Load some data to look at data types

In [63]:
# choose some columns to see how much memory can be saved
college = pd.read_csv('data/college.csv')

# select certain columns
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER',
   'INSTNM', 'STABBR']
col2 = college.loc[:, different_cols]

### Changing Numerical columns

Get column types

In [51]:
col2.dtypes

RELAFFIL      int64
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object

check memory usage

In [52]:
original_mem = col2.memory_usage(deep=True)
original_mem

Index          128
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

change type to save thata as 'RELAFFIL' column is binary

In [53]:
col2['RELAFFIL'] = col2['RELAFFIL'].astype(np.int8)    

see memory usage saved

In [54]:
col2.memory_usage(deep=True)

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64

### Changing type to categorical columns (for categorical data)

load data

In [55]:
### choose some columns to see how much memory can be saved
# college = pd.read_csv('data/college.csv')

### select certain columns
# different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER',
#    'INSTNM', 'STABBR']
# col2 = college.loc[:, different_cols]

In [56]:
print('Original data:')
print( col2.dtypes)
print('\n')
print('Original memory usage:')
print(college[different_cols].memory_usage(deep=True))

Original data:
RELAFFIL       int8
SATMTMID    float64
CURROPER      int64
INSTNM       object
STABBR       object
dtype: object


Original memory usage:
Index          128
RELAFFIL     60280
SATMTMID     60280
CURROPER     60280
INSTNM      660240
STABBR      444565
dtype: int64


Number of object columns

In [57]:
col2.select_dtypes(include=['object']).nunique()

INSTNM    7535
STABBR      59
dtype: int64

Change object column to category

In [58]:
col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes

RELAFFIL        int8
SATMTMID     float64
CURROPER       int64
INSTNM        object
STABBR      category
dtype: object

Check memory we have saved

In [59]:
new_mem = col2.memory_usage(deep=True)
new_mem

Index          128
RELAFFIL      7535
SATMTMID     60280
CURROPER     60280
INSTNM      660699
STABBR       13576
dtype: int64

Below we can see that we reduced the memory to an eighth of the memory for RELAFFIL and 3% for STABBR!

In [60]:
# run section above
new_mem / original_mem

Index       1.000000
RELAFFIL    0.125000
SATMTMID    1.000000
CURROPER    1.000000
INSTNM      1.000695
STABBR      0.030538
dtype: float64

### Column types for data

data types per column

In [64]:
college.dtypes

INSTNM                 object
CITY                   object
STABBR                 object
HBCU                  float64
MENONLY               float64
                       ...   
PCTPELL               float64
PCTFLOAN              float64
UG25ABV               float64
MD_EARN_WNE_P10        object
GRAD_DEBT_MDN_SUPP     object
Length: 27, dtype: object

Number of columns with specific data types

In [65]:
college.dtypes.value_counts()

float64    20
object      5
int64       2
dtype: int64