# Feb 5th Note

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the grades table; the relative path is resolved against the notebook's working directory.
df = pd.read_csv('grades.csv')

In [3]:
# Show the whole table (5 rows). Row 2 (cecily) has no 'final' score; the later cells deal with that.
df

Unnamed: 0,name,q1,q2,q3,t1,t2,final
0,amit,4,2,5,87,23,59.0
1,bearegard,3,4,2,78,34,67.0
2,cecily,3,5,3,34,40,
3,deshaun,5,5,5,40,39,69.0
4,esmeralda,4,3,4,37,34,55.0


In [4]:
# Column dtypes: 'name' is object (text), the quiz/test columns are int64, and 'final' is
# float64 -- consistent with it holding a missing value, since NaN needs a float dtype.
df.dtypes

name      object
q1         int64
q2         int64
q3         int64
t1         int64
t2         int64
final    float64
dtype: object

In [5]:
# Keep only the numeric columns (this leaves out the text column 'name')
df1 = df.select_dtypes(include=np.number)
df1

Unnamed: 0,q1,q2,q3,t1,t2,final
0,4,2,5,87,23,59.0
1,3,4,2,78,34,67.0
2,3,5,3,34,40,
3,5,5,5,40,39,69.0
4,4,3,4,37,34,55.0


## Dropping NaN

In [6]:
# `~` is element-wise NOT: it flips every boolean in the isna() mask,
# so this keeps exactly the rows whose 'final' is not NaN.
df[~df['final'].isna()]

Unnamed: 0,name,q1,q2,q3,t1,t2,final
0,amit,4,2,5,87,23,59.0
1,bearegard,3,4,2,78,34,67.0
3,deshaun,5,5,5,40,39,69.0
4,esmeralda,4,3,4,37,34,55.0


In [7]:
# Same rows via dropna: axis=0 drops rows (not columns), and how='any' drops a row
# as soon as at least one of its values is missing.
df.dropna(axis=0, how='any')

Unnamed: 0,name,q1,q2,q3,t1,t2,final
0,amit,4,2,5,87,23,59.0
1,bearegard,3,4,2,78,34,67.0
3,deshaun,5,5,5,40,39,69.0
4,esmeralda,4,3,4,37,34,55.0


In [8]:
# Drop the incomplete rows from the numeric frame as well
# (row-wise, any missing value -- these are dropna's defaults).
df1 = df1.dropna()
df1

Unnamed: 0,q1,q2,q3,t1,t2,final
0,4,2,5,87,23,59.0
1,3,4,2,78,34,67.0
3,5,5,5,40,39,69.0
4,4,3,4,37,34,55.0


In [9]:
# Target vector: the 'final' scores of the rows that survived dropna.
# It keeps df1's row labels (0, 1, 3, 4).
y = df1['final']
y

0    59.0
1    67.0
3    69.0
4    55.0
Name: final, dtype: float64

## Dropping the `final` column so that we can predict it

In [10]:
# Remove the target so df1 holds predictors only.
# Not re-runnable as-is: once 'final' is gone, a second run raises KeyError.
df1 = df1.drop('final', axis=1)

In [11]:
# df1 now holds only the predictor columns; its row labels are still 0, 1, 3, 4
# (label 2 went away with the NaN row).
df1

Unnamed: 0,q1,q2,q3,t1,t2
0,4,2,5,87,23
1,3,4,2,78,34
3,5,5,5,40,39
4,4,3,4,37,34


In [12]:
# Scratch check: np.ones(n) gives a length-n float array of 1.0 -- used below for the intercept column.
np.ones(32)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [13]:
# Something looks wrong here: concat(axis=1) lines rows up by index LABEL, not by position.
# The intercept frame is labelled 0..3 while df1 is labelled 0, 1, 3, 4, so the result
# spans labels 0..4 and gets NaN wherever one side has no such label (rows 2 and 4).
pd.concat( [pd.DataFrame({'intercept':np.ones(len(df1))}), df1], axis=1)

Unnamed: 0,intercept,q1,q2,q3,t1,t2
0,1.0,4.0,2.0,5.0,87.0,23.0
1,1.0,3.0,4.0,2.0,78.0,34.0
2,1.0,,,,,
3,1.0,5.0,5.0,5.0,40.0,39.0
4,,4.0,3.0,4.0,37.0,34.0


In [14]:
# reset_index() renumbers the rows 0..n-1 but keeps the old labels as a new 'index' column.
# It returns a new frame; df1 itself is unchanged.
df1.reset_index()

Unnamed: 0,index,q1,q2,q3,t1,t2
0,0,4,2,5,87,23
1,1,3,4,2,78,34
2,3,5,5,5,40,39
3,4,4,3,4,37,34


In [15]:
# drop=True throws the old labels away instead of adding them as a column.
df1.reset_index(drop=True)

Unnamed: 0,q1,q2,q3,t1,t2
0,4,2,5,87,23
1,3,4,2,78,34
2,5,5,5,40,39
3,4,3,4,37,34


In [16]:
# Now this works: both pieces are labelled 0..n-1, so concat pairs the rows up correctly.
intercept_col = pd.DataFrame({'intercept': np.ones(len(df1))})
predictors = df1.reset_index(drop=True)
X = pd.concat([intercept_col, predictors], axis=1)
X

Unnamed: 0,intercept,q1,q2,q3,t1,t2
0,1.0,4,2,5,87,23
1,1.0,3,4,2,78,34
2,1.0,5,5,5,40,39
3,1.0,4,3,4,37,34


In [17]:
# Note: y still carries the original row labels (0, 1, 3, 4) while X is labelled 0..3.
# The rows are nevertheless in the same order, which is what matters for the fit below.
y

0    59.0
1    67.0
3    69.0
4    55.0
Name: final, dtype: float64

In [18]:
# Least squares: find beta_hat minimising ||X @ beta - y||.
# X has 4 rows and 6 columns, so the system is underdetermined; among the solutions
# that minimise the error, lstsq returns the one with the smallest 2-norm.
# The inputs are handed over as plain arrays so the row match-up is explicitly
# positional (y is labelled 0, 1, 3, 4 while X is labelled 0..3).
# rcond=None selects NumPy's current default singular-value cutoff
# (machine epsilon * max(M, N)) in place of the legacy rcond=-1 setting.
(beta_hat, residuals, rank, s) = np.linalg.lstsq(X.to_numpy(), y.to_numpy(), rcond=None)
beta_hat

array([-0.28891963,  1.10624344,  3.61590521,  1.16942108,  0.28851659,
        0.72539506])

In [19]:
# Look at the original table again to read off the predictor values of cecily (row 2),
# the student whose 'final' is missing.
df

Unnamed: 0,name,q1,q2,q3,t1,t2,final
0,amit,4,2,5,87,23,59.0
1,bearegard,3,4,2,78,34,67.0
2,cecily,3,5,3,34,40,
3,deshaun,5,5,5,40,39,69.0
4,esmeralda,4,3,4,37,34,55.0


In [20]:
# Predict the missing final for cecily (row label 2 in df, the row whose 'final' is NaN).
# Her predictor values are read from df instead of being retyped by hand, in the same
# column order as X so that each value lines up with its coefficient in beta_hat;
# the leading 1.0 is the intercept term.
predictor_cols = [col for col in X.columns if col != 'intercept']
cecily_x = np.concatenate(([1.0], df.loc[2, predictor_cols].to_numpy(dtype=float)))
np.matmul(cecily_x, beta_hat)

63.44296644251851

In [21]:
# Design matrix once more for reference: 4 rows x 6 columns (intercept + 5 predictors).
X

Unnamed: 0,intercept,q1,q2,q3,t1,t2
0,1.0,4,2,5,87,23
1,1.0,3,4,2,78,34
2,1.0,5,5,5,40,39
3,1.0,4,3,4,37,34


In [22]:
# Fitted values on the training rows. They reproduce y (59, 67, 69, 55) because there are
# more unknowns (6) than equations (4), so the fit is exact -- that alone says nothing
# about how well the model would predict new rows.
X @ beta_hat

0    59.0
1    67.0
2    69.0
3    55.0
dtype: float64

In [23]:
df['final'].apply(lambda x : 'A' if x > 60 else 'B')

0    B
1    A
2    B
3    A
4    B
Name: final, dtype: object