## Lecture 2 

### Pt1 - Informal Data Science Introduction

In [None]:
import numpy as np
import pandas as pd
from fairlearn.datasets import fetch_adult
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report

In [None]:
# Fetch the Adult dataset through fairlearn; the result exposes the feature
# table under 'data' and the labels under 'target'.
raw_data = fetch_adult(as_frame=True)
# Work on a copy of the full feature table. The previous version first built a
# 20% sample and then immediately overwrote it with this full copy, so the
# sample was never used. To experiment on a subset instead, swap in:
#   raw_df = raw_data['data'].sample(frac=0.2, random_state=42).copy()
raw_df = raw_data['data'].copy()

In [None]:
# Preview the first rows of the feature table.
raw_df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the feature table.
raw_df.describe()

In [None]:
# Same summary after discarding rows that contain missing values; compare the
# counts with the previous output to see how many rows dropna() removes.
raw_df.dropna().describe()

In [None]:
# Inspect the labels that come with the dataset.
raw_data['target']

In [None]:
# Column dtypes: shows which features are numeric and which are not.
raw_df.dtypes

In [None]:
# Look at the labels once more before they are attached to raw_df below.
# NOTE(review): this repeats an earlier cell verbatim; one of the two could go.
raw_data['target']

In [None]:
# Attach the labels to the feature table so that rows with missing values are
# removed from features and labels together.
raw_df['target'] = raw_data['target']
df = raw_df.dropna()
# NOTE(review): dropping a column in place on the result of dropna() can emit
# pandas' SettingWithCopyWarning on some versions. Presumably intentional, as
# the next cell repeats this step with an explicit .copy() - confirm.
df.drop(columns=['fnlwgt'],inplace=True)

In [None]:
# Rebuild the cleaned frame, this time taking an explicit copy of the dropna()
# result so that `df` is an independent object before it is modified in place.
df = raw_df.dropna().copy() # note the copy command
# Remove the 'fnlwgt' column from the cleaned frame.
df.drop(columns=['fnlwgt'],inplace=True)

In [None]:
# One-hot encode the non-numeric columns. drop_first=True omits the first level
# of each, so a two-valued column becomes a single indicator column; this is
# where the 'target_>50K' column used by the model cells comes from.
df = pd.get_dummies(df, drop_first=True)
df.head()

In [None]:
# Z-score the numeric features: subtract each column's mean and divide by its
# standard deviation.
# The columns are picked from raw_df (before one-hot encoding) so the indicator
# columns are left alone. select_dtypes(include='number') replaces an exact
# `dtypes == 'int64'` comparison, which silently selects nothing when the
# loader returns these columns as float64 or a narrower integer type.
num_cols = raw_df.drop(columns=['fnlwgt']).select_dtypes(include='number').columns
df[num_cols] = (df[num_cols] - df[num_cols].mean()) / df[num_cols].std()
df.head()

In [None]:
# Hold out a test set (train_test_split keeps 25% for testing by default).
# random_state pins the shuffle so a fresh Restart & Run All yields the same
# rows in each part, and therefore comparable metrics, every time.
df_train, df_test = train_test_split(df, random_state=42)
df_test

In [None]:
# Train a random forest on the training part and store its predictions for the
# test part in a 'y_pred' column.
target_col = 'target_>50K'
# Take the feature list from df_train, which never receives 'y_pred'. Dropping
# only the target from df_test would feed 'y_pred' back in as a feature when
# this cell is run a second time, and predict() would then fail.
feature_cols = [col for col in df_train.columns if col != target_col]

# Seeded so the fitted model (and the metrics below) are reproducible.
clf = RandomForestClassifier(random_state=42)
clf.fit(df_train[feature_cols], df_train[target_col].values)

# assign() builds a new frame instead of writing into the slice returned by
# train_test_split, which avoids SettingWithCopyWarning.
df_test = df_test.assign(y_pred=clf.predict(df_test[feature_cols]))

In [None]:
# True label next to the model's prediction for the test rows.
df_test[['target_>50K','y_pred']]

In [None]:
# Count how the predictions line up with the true labels and draw the result
# as a confusion-matrix plot.
cm = confusion_matrix(
    y_true=df_test['target_>50K'].values,
    y_pred=df_test['y_pred'],
)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

In [None]:
# Text summary of precision, recall and F1 per class on the test rows.
report = classification_report(
    y_true=df_test['target_>50K'].values,
    y_pred=df_test['y_pred'],
)
print(report)

### Pt2 - More formal introduction to Pandas

In [None]:
# Small demo frame: an integer column, a random float column and a string
# column. Without an explicit index pandas numbers the rows 0..4.
df = pd.DataFrame(
    {
        'A': np.arange(5),
        'B': np.random.random(5),
        'C': ['apple', 'orange', 'pear', 'apple', 'orange'],
    }
)
df

In [None]:
# Same demo frame, now with an explicit index of five random integers drawn
# from 0..9 (labels may repeat and are not sorted).
df = pd.DataFrame(
    {
        'A': np.arange(5),
        'B': np.random.random(5),
        'C': ['apple', 'orange', 'pear', 'apple', 'orange'],
    },
    index=np.random.randint(0, 10, 5),
)
df

In [None]:
# Mean of every numeric column. numeric_only=True skips the string column 'C';
# without it, pandas >= 2.0 raises a TypeError instead of silently ignoring
# non-numeric columns as older versions did.
df.mean(numeric_only=True)

In [None]:
# Alternative: pick the numeric columns explicitly, then average them.
df[['A','B']].mean() # Note the double brackets: the outer [] selects, the inner [] is a list of column names

In [None]:
# Rows ordered by column 'A'. This returns a new frame; df itself is unchanged.
df.sort_values(by='A')

In [None]:
# Rows ordered by the float column 'B' (again a new frame, df is unchanged).
df.sort_values(by='B')

In [None]:
# Sorting also works on the string column 'C' (new frame, df is unchanged).
df.sort_values(by='C')

In [None]:
# With inplace=True the sort is applied to df itself and nothing is returned,
# so df is displayed separately to show the new row order.
df.sort_values(by='C',inplace=True)
df

In [None]:
# rename() without inplace returns a renamed copy, which is what is displayed
# here; df itself still has the column 'C'.
df.rename(columns={'C':'fruits'})

In [None]:
# df still has column 'C': the rename in the previous cell was not in place.
df

In [None]:
# Now rename in place, so df itself carries the new column name 'fruits'.
df.rename(columns={'C':'fruits'},inplace=True)
df

In [None]:
# The index is iterable: walk over the row labels and print them one per line.
for row_label in df.index:
    print(row_label)

In [None]:
# Replace the current row labels with the default 0..n-1 range. drop=True
# discards the old labels instead of keeping them as a new column.
df.reset_index(drop=True,inplace=True)
df

In [None]:
# Use the values of the 'fruits' column as row labels. Passing the Series
# (rather than the column name) keeps 'fruits' as a regular column too.
# Not in place: the result is only displayed and df keeps its current index.
df.set_index(df.fruits)

In [None]:
# Make sure df has the default 0..n-1 index before adding a column.
df.reset_index(drop=True,inplace=True)
# Add a column 'C' with one uniform random number in [0, 1) per row. The draw
# is sized from the frame, replacing a hard-coded 1x5 array from which the
# first row was taken.
df['C'] = np.random.rand(len(df))
df

In [None]:
# Derive a new column from an existing one; the arithmetic is applied to every
# row at once.
df['D'] = df['C'] + 1
df

In [None]:
# Give the rows non-default integer labels to contrast label-based (.loc) with
# position-based (.iloc) selection.
# The labels are fixed and unique rather than drawn at random: the cells below
# look up label 17, which an unseeded random draw would only rarely contain
# (KeyError on a fresh run). They are passed as an array because a plain list
# would be read as a list of column names.
df.set_index(np.array([17, 42, 3, 88, 56]), inplace=True)
df

In [None]:
# Three ways of selecting data.
# The label is read from the index instead of hard-coding 17, which raised a
# KeyError whenever that value was not among the current index labels. The
# last label is used so .loc and .iloc below show different rows.
example_label = df.index[-1]
print(df.loc[example_label]) # select on index label
print(df.iloc[0]) # select on row number (position)
print(df['A']) # select column

In [None]:
# Single value at the first row and third column (both positions, zero-based).
df.iloc[0,2]

In [None]:
# Label-based selection of one row, restricted to two columns. The row label is
# taken from the index rather than hard-coding 17, which raised a KeyError
# whenever that value was not among the current index labels.
df.loc[df.index[-1], ['fruits', 'A']]

In [None]:
# Method chaining: each call returns a new frame, so the steps can be stacked.
(
    df.loc[:, ['fruits', 'A']]                 # all rows, two columns
    .reset_index(drop=True)                    # back to the default 0..n-1 labels
    .rename(columns={'fruits': 'products'})    # friendlier column name
)

In [None]:
# Same chain as in the previous cell, this time stored in a new variable
# instead of only being displayed.
df2 = (
    df.loc[:, ['fruits', 'A']]
    .reset_index(drop=True)
    .rename(columns={'fruits': 'products'})
)

### Exercises

0. Set up a Python environment with pandas

1.  Define the dataframe below and then sort it simultaneously by columns A and B. 

```df = pd.DataFrame({'A':np.arange(10),'B':np.random.random(10),'C':['car','bus','aeroplane','car','bus','car','bus','aeroplane','car','bus']},index=np.random.randint(0,10,10))```

2. Using the DataFrame above, sort it by column C in reverse alphabetical order

3. Add a column D which is the mean of columns A and B

4. Using method chaining, reset the index, select only the numerical columns and find the median of every column

5. Using the ```axis``` parameter find the median of every row. 

6. Replace all values that say 'car' with 'electric car'