In [49]:
import numpy as np
import pandas as pd
from io import StringIO
# Small in-memory CSV: a header row plus three data rows.
# Two fields are left empty on purpose (C in the second row, D in the third)
# so that the parsed frame contains missing values to experiment with.
csv = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '0.0,11.0,12.0,',
])
# StringIO lets read_csv treat the string as if it were a file
df = pd.read_csv(StringIO(csv))

In [10]:
# show the parsed frame; the two empty CSV fields appear as missing values
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [13]:
# Boolean frame flagging every missing cell; summing it counts the nulls
null_mask = df.isnull()

# number of null values in each column
print(null_mask.sum())

# number of null values in each row
print(null_mask.sum(axis=1))

A    0
B    0
C    1
D    1
dtype: int64
0    0
1    1
2    1
dtype: int64


In [20]:
# Demonstrates the main dropna() variants. None of the calls is assigned back,
# so df itself keeps its missing values for the following cells.

# remove whole rows with null values
print(df.dropna())

# remove whole columns with null values
print(df.dropna(axis=1))

# only drop rows where all columns are NaN
# (printed explicitly: a bare expression in the middle of a cell is evaluated
# and then discarded, so without print() this result would never be shown)
print(df.dropna(how='all'))

# remove whole rows where column C is null
# (last expression of the cell, so it is rendered as the cell output)
df.dropna(subset=['C'])

     A    B    C    D
0  1.0  2.0  3.0  4.0
     A     B
0  1.0   2.0
1  5.0   6.0
2  0.0  11.0


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,0.0,11.0,12.0,


In [44]:
# filling in missing data with column means
# NOTE: this cell expects df to be the all-numeric CSV frame. df is rebound to a
# frame with string columns further down the notebook; fitting the imputer on
# that frame fails with "could not convert string to float".
try:
    # scikit-learn >= 0.20; the legacy Imputer class was removed in 0.22
    from sklearn.impute import SimpleImputer
    imr = SimpleImputer(missing_values=np.nan, strategy='mean')  # mean of each column
except ImportError:
    # older scikit-learn releases only ship the legacy class
    from sklearn.preprocessing import Imputer
    imr = Imputer(missing_values='NaN', strategy='mean', axis=0)  # take mean of col
imr = imr.fit(df)                 # learn the per-column means
imputed_data = imr.transform(df)  # array with the NaNs replaced by those means
imputed_data

ValueError: could not convert string to float: 'class1'

In [133]:
# Toy frame with string-valued color/size/class columns and a numeric price,
# used by the encoding examples that follow. Built column by column; the
# explicit `columns` list pins the column order.
df = pd.DataFrame(
    {
        'color': ['green', 'red', 'blue'],
        'size': ['M', 'L', 'XL'],
        'price': [10.1, 13.5, 15.3],
        'class': ['class1', 'class2', 'class1'],
    },
    columns=['color', 'size', 'price', 'class'],
)
df

Unnamed: 0,color,size,price,class
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [134]:
# sizes are ordinal, so each label gets an integer that preserves the order
size_mapping = dict(XL=3, L=2, M=1)
# integer -> label lookup, for turning the codes back into size names
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'] = df['size'].map(size_mapping)
df
# to undo the encoding:
#df['size'] = df['size'].map(inv_size_mapping)
#df

Unnamed: 0,color,size,price,class
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [135]:
# build the label -> integer map automatically from the distinct class values
unique_labels = np.unique(df['class'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
# integer -> label lookup, for restoring the original class names
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df['class'] = df['class'].map(class_mapping)
df
# to undo the encoding:
#df['class'] = df['class'].map(inv_class_mapping)
#df

Unnamed: 0,color,size,price,class
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [136]:
# using sklearn for same purpose
# NOTE(review): when the notebook is run top to bottom, df['class'] has already
# been replaced by the integers from class_mapping in the cell above, so the
# encoder here is fitted on those integers rather than on the original strings.
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
# fit_transform learns the distinct values and returns their integer codes in one step
y = class_le.fit_transform(df['class'].values)
print(y)       
# to map the integer codes back to the values the encoder was fitted on:
#y = class_le.inverse_transform(y)
#print(y)

[0 1 0]


In [137]:
# since colors are nominal, not ordinal, mapping them straight to an integer
# index would impose an order that does not exist; one-hot encode them instead

# 1. first transform color names to integers
# (uses its own encoder: refitting class_le here would overwrite the class
# labels it learned in the previous cell)
color_le = LabelEncoder()
df['color_i'] = color_le.fit_transform(df['color'].values)

# 2. apply onehot encoding to split each color into its own column
from sklearn.preprocessing import OneHotEncoder
# NOTE(review): `categorical_features` was deprecated in scikit-learn 0.20 and
# removed in 0.22; on newer releases wrap OneHotEncoder in a ColumnTransformer
# instead. Confirm against the installed version.
ohe = OneHotEncoder(categorical_features=[0]) # apply on first column
X = df[['color_i','size','price']].values
# despite the variable name, the encoder's result is converted with .toarray()
# below before display, i.e. it is treated as a sparse matrix
dense_matrix = ohe.fit_transform(X)
dense_matrix.toarray()

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [147]:
# same purpose done through pd.get_dummies
# note: with this, colors do NOT need to be transformed into int beforehand
# Drop the helper column without mutating in place; errors='ignore' keeps the
# cell re-runnable once 'color_i' is already gone (the in-place drop raised
# KeyError on a second run).
df = df.drop('color_i', axis=1, errors='ignore')
pd.get_dummies(df[['color','size','price']])

Unnamed: 0,size,price,color_blue,color_green,color_red
0,1,10.1,0,1,0
1,2,13.5,0,0,1
2,3,15.3,1,0,0


In [158]:
# UCI wine dataset: the raw file has no header row, so the column names are
# supplied here; the first column holds the class label.
wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine_columns = [
    'Class label',
    'Alcohol',
    'Malic acid',
    'Ash',
    'Alcalinity of ash',
    'Magnesium',
    'Total phenols',
    'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity',
    'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]
df_wine = pd.read_csv(wine_url, header=None, names=wine_columns)
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [160]:
# same as ch3
# hold out 30% of the samples as a test set; random_state=0 fixes the shuffle
from sklearn.model_selection import train_test_split
X = df_wine.iloc[:, 1:].values  # every column after the first -> feature matrix
y = df_wine.iloc[:, 0].values   # first column -> target vector
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [169]:
# feature scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization: (x - min) / (max - min)
# the scaler is fitted on the training split only ...
mms = MinMaxScaler()
mms.fit(X_train)
X_train_norm = mms.transform(X_train)
# ... and the test split is scaled with the training split's min/range
X_test_norm = mms.transform(X_test)

# Standardization: z-score, (x - mean) / std
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
# again, the test split reuses the statistics learned from the training split
X_test_std = stdsc.transform(X_test)

In [170]:
# L2 regularization: penalty is the sum of squared weights
# L1 regularization: penalty is the sum of absolute weights
#                    (pushes some weights to exactly 0 -> sparse solution)
from sklearn.linear_model import LogisticRegression
# The solver is named explicitly: 'liblinear' supports the L1 penalty and was
# the default when this was written; since scikit-learn 0.22 the default is
# 'lbfgs', which rejects penalty='l1'.
# C is the inverse regularization strength (smaller C -> stronger penalty).
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr.fit(X_train_std, y_train)
print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))

Training accuracy: 0.9838709677419355
Test accuracy: 0.9814814814814815


In [180]:
# intercept (bias) terms of the fitted model: the value of the linear part when
# every feature is 0. The output holds one entry per class (three here).
lr.intercept_

array([-0.38380921, -0.15810012, -0.70039199])

In [182]:
# weight coefficients: one row per class, one column per feature
# (the output is 3 rows x 13 columns); many entries are exactly 0.
# NOTE(review): the visible rows of df_wine show 'Class label' starting at 1,
# so the rows presumably correspond to classes 1, 2 & 3 rather than 0, 1 & 2;
# confirm the order with lr.classes_.
lr.coef_

array([[ 0.28001689,  0.        ,  0.        , -0.02795529,  0.        ,
         0.        ,  0.70995776,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.2365037 ],
       [-0.6439199 , -0.06881946, -0.05719684,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        , -0.92693494,
         0.06004165,  0.        , -0.37102705],
       [ 0.        ,  0.0613494 ,  0.        ,  0.        ,  0.        ,
         0.        , -0.6369529 ,  0.        ,  0.        ,  0.49839585,
        -0.35828216, -0.57037875,  0.        ]])