In [None]:
import os
# Set the working directory to the folder where your datasets are stored.
# The figures, etc. that are produced by this notebook will be stored in that folder as well.
# To run the notebook on another machine without editing it, set the ECON_DATA_DIR
# environment variable to your own data folder; if it is not set, the original course path is used.
DATA_DIR = os.environ.get('ECON_DATA_DIR', '/Users/hj020/Desktop/2022/EconomicAnalytics-master/Python_/Data')
os.chdir(DATA_DIR)

In [None]:
# import modules and packages
import numpy as np # module for data manipulation (similar to matlab)
import pandas as pd # module for data manipulation (similar to stata)
import math # math fns

In [None]:
# load data using pd.read_csv (data0.csv will be uploaded to your workspace;
# the relative file name is resolved against the current working directory)
raw0 = pd.read_csv('data0.csv')

### <font color='green'> Differences between NumPy and Pandas </font>
https://discuss.codecademy.com/t/what-are-some-differences-between-pandas-numpy-and-matplotlib/354475


* The biggest difference is that Pandas creates and uses tabular data (a "DataFrame", i.e. a user-friendly table) that can be easily visualized. However, because it comes with many built-in features (e.g. row and column names), it can be very slow in optimization/computation. In contrast, NumPy creates and uses very simple numerical arrays (e.g. vectors and matrices) that are superior for mathematical/numerical computation.


* We will learn how to convert a Pandas DataFrame to a NumPy array using raw0.values, and see the differences between the two when accessing and manipulating data.

In [None]:
# check how the data was loaded by displaying the first five rows
# the leftmost column of observation numbers (the index) and the top row of variable names are not part of the data
# Python indexing starts from 0 (0-indexed)
raw0.head()

In [None]:
# view the DataFrame as a NumPy array (only the values; row and column labels are not included)
raw0.values

In [None]:
# drop every row that contains at least one missing value
# (the remaining rows keep their original index labels; the index is not renumbered)
raw0=raw0.dropna()

In [None]:
# check the length of the data, i.e. the number of rows (= n)
len(raw0)

In [None]:
# check the shape of the data: (n rows, p columns)
raw0.shape

### <font color='green'> Datatypes
https://pbpython.com/pandas_dtypes.html
    
https://docs.python.org/3/tutorial/floatingpoint.html

* The most frequently used types are integer, float (similar to a decimal number; see the second reference for more information about floats) and string.

* Each type takes a different amount of space in the computer's memory, and some functions only work for specific types, so sometimes we need to change the type of the data.

In [None]:
# check the datatype of each column of the data
raw0.dtypes

In [None]:
# change data types (census, traffic and blacktract to integer)
# astype() returns a converted copy instead of modifying the column in place,
# so each copy is assigned back to its column of the original data for the change to take effect
for column in ('census', 'traffic', 'blacktract'):
    raw0[column] = raw0[column].astype(int)

In [None]:
# check the datatypes again to see the effect of the conversion
raw0.dtypes

### <font color='green'> Accessing/selecting row(s), column(s) and cell(s) in Pandas
https://www.shanelynn.ie/select-pandas-dataframe-rows-and-columns-using-iloc-loc-and-ix/

https://towardsdatascience.com/a-python-beginners-look-at-loc-part-1-cb1e1e565ec2
    
* iloc - locates data points by integer position, i.e. by the "updated" row \& column numbers that apply after some rows or columns have been removed
* using variable name - selects a column by its name; rows are then located by the "original" row numbers (index labels) assigned when the data was read
* loc - useful when selecting rows or columns that satisfy certain conditions (e.g. >, ==)

In [None]:
# select a single cell by position using iloc: row position 1 (the second row), column position 0 (the first column)
raw0.iloc[1, 0]

In [None]:
# select the cell at row position 99277 of the first column
# NOTE(review): 99277 is hard-coded; presumably it is the position of the last row (len(raw0) - 1) — confirm against the data
raw0.iloc[99277,0] 
# the last element can also be accessed by position using index -1, e.g. raw0.iloc[-1, 0]
# (this, however, doesn't work when a variable name is used to select a column, because rows are then looked up by label)

In [None]:
# access a part of a string in a cell using iloc:
# [2] picks the third character of the string stored in the first row, first column
raw0.iloc[0, 0][2]

In [None]:
# select columns using variable names (passing a list of names returns a DataFrame with those columns)
raw0[['date','census']]

In [None]:
# access a part of a string in a cell: the first three characters of "date" for the row with index label 0
raw0['date'][0][:3]

In [None]:
# look up "date" for the row whose index label is 102509
# (a label assigned when the data was read, not a position — compare with the iloc examples)
raw0['date'][102509]

### <font color='red'> In-Class Exercise 1: Creating a dummy for person's gender

* We want to create a dummy variable that equals 1 if the driver is female (F), and 0 otherwise

In [None]:
# check the list of unique elements (categories) in person_gender
set(raw0['person_gender'])

### <font color='green'> Important Operators in Python
https://www.programiz.com/python-programming/operators
    
https://www.geeksforgeeks.org/python-operators/
    
* Arithmetic
* Logical/Identity/Comparison
* Assignment

In [None]:
# comparison operator ==: returns a boolean Series that is True for the rows where person_gender is 'F'
raw0.person_gender == 'F'

In [None]:
# the same comparison for 'M'
raw0.person_gender == 'M'

In [None]:
# | is the element-wise OR: True for the rows where person_gender is either 'F' or 'M'
# each comparison is wrapped in parentheses because | binds more tightly than ==
(raw0.person_gender == 'F') | (raw0.person_gender == 'M')

In [None]:
# keep only the rows whose person_gender is "F" or "M" (rows with any other category are removed)
# isin() tests both categories at once; .copy() makes the filtered result an independent DataFrame,
# so that assigning to its columns in later cells cannot trigger pandas' SettingWithCopyWarning
raw0 = raw0.loc[raw0.person_gender.isin(['F', 'M'])].copy()

In [None]:
# re-evaluate the condition on the filtered data to check the result of the filter
(raw0.person_gender == 'M')|(raw0.person_gender == 'F')

In [None]:
# check the unique categories of person_gender again
set(raw0['person_gender'])

In [None]:
# replace "person_gender" with a 0/1 dummy: 1 if F, 0 otherwise
# run this cell only once: after the replacement no value equals 'F' any more,
# so running it a second time would set every row to 0
raw0['person_gender'] = raw0['person_gender'].eq('F').astype(int)

In [None]:
# check the unique categories of sex_1
set(raw0['sex_1'])

In [None]:
# do the same for sex_1: replace it with a 0/1 dummy that is 1 if F, 0 otherwise
# run this cell only once: once the column holds 0/1, a second run would set every row to 0
raw0['sex_1'] = raw0['sex_1'].eq('F').astype(int)

In [None]:
# display the first five rows to check the recoded columns
raw0.head()

### <font color='red'> In-Class Exercise 2: Creating a set of dummies for person's race

* We want to create two dummies
    - First dummy that is 1 if the driver is B, 0 otherwise
    - Second dummy that is 1 if the driver is A or I or O or U, 0 otherwise (W is the baseline group)

In [None]:
# create a 0/1 dummy, D_B, that is 1 if person_race is "B" and 0 otherwise, and add it to the data as a new column
# (the list of unique elements in person_race is checked in the next cell)
# there are many other ways to add new columns to data in Pandas (see https://www.geeksforgeeks.org/adding-new-column-to-existing-dataframe-in-pandas/)

raw0['D_B'] = raw0['person_race'].eq('B').astype(int)

In [None]:
# check the list of unique elements (categories) in person_race
set(raw0['person_race'])

In [None]:
# display the first five rows to check the new D_B column
raw0.head()

In [None]:
# do the same for the other dummy: D_Other is 1 if person_race is A, I, O or U, and 0 otherwise
# isin() checks membership in the whole list at once, replacing four == comparisons chained with |
raw0['D_Other'] = raw0['person_race'].isin(['A', 'I', 'O', 'U']).astype(int)

In [None]:
# display the first five rows to check the new D_Other column
raw0.head()

### <font color='red'> In-Class Exercise 3: creating age variable for driver

* Definition of age: age = date (string, yyyy-mm-dd) - person_dob (string, yyyy-mm-dd)

In [None]:
# [Step 1]
# take the year (the first four characters, yyyy) of "date" and "person_dob" for the row with index label 0
# and store them in "dyear" and "byear"
# convert the strings to integers and calculate the difference
dyear, byear = (raw0[column][0][:4] for column in ('date', 'person_dob'))
dyearn, byearn = int(dyear), int(byear)
age = dyearn - byearn
age

In [None]:
# [Step 2]
# take the month (characters 5-6, mm) of "date" and "person_dob" for the row with index label 0
# and store them in "dmon" and "bmon"
# convert the strings to integers and calculate the difference
# if the difference in month is negative, then subtract one from the age computed in Step 1
# (this cell updates "age", so re-run Step 1 first if you need to run it again)
dmon, bmon = (raw0[column][0][5:7] for column in ('date', 'person_dob'))
dmonn, bmonn = int(dmon), int(bmon)
mond = dmonn - bmonn
age = age - 1 if mond < 0 else age
age

### <font color='green'> For-Loops in Python
https://www.w3schools.com/python/python_for_loops.asp
    
### <font color='green'> If statements in Python
https://www.w3schools.com/python/python_conditions.asp


In [None]:
# use a loop to repeat Steps 1 and 2 for all the observations
# the two columns are read by name, as in Steps 1 and 2, so the result does not depend on column order,
# and they are iterated over directly instead of doing four .iloc lookups per row (much faster on large data)
# NOTE(review): the earlier version of this loop read column positions 0 and 4; confirm those are "date" and "person_dob"
D_age = np.zeros((len(raw0),), dtype=int)
for i, (event_date, birth_date) in enumerate(zip(raw0['date'], raw0['person_dob'])):
    # difference in years (yyyy) and in months (mm); the day of the month is not compared
    age = int(event_date[:4]) - int(birth_date[:4])
    mond = int(event_date[5:7]) - int(birth_date[5:7])
    # the month of "date" is earlier than the birth month: one year fewer has been completed
    if mond < 0:
        age = age - 1
    D_age[i] = age

In [None]:
# add the age array to raw0 as a new column named D_age
# (a NumPy array is assigned by position, so it must have one entry per row of raw0)
raw0['D_age'] = D_age

In [None]:
# display the first five rows to check the new D_age column
raw0.head()

### <font color='darkred'> HW1: Following the same steps we used for D_age,
1. create an age variable for officer: O_age
2. create a tenure variable for officer: Exp, which is defined as Exp = date - apptdate_1
3. Append the two variables to raw0