In [1]:
import pandas as pd
from io import StringIO
import sys
# Small in-memory CSV; the blank fields are parsed as missing values (NaN).
csv_data = (
    'A,B,C,D\n'
    '1.0,2.0,3.0,4.0\n'
    '5.0,6.0,,8.0\n'
    '10.0,,12.0,'
)
# StringIO wraps the text in a file-like object that read_csv can consume.
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,,12.0,


In [2]:
# Count the missing values in each column (every True in the mask adds 1).
df.isna().sum()

A    0
B    1
C    1
D    1
dtype: int64

In [3]:
# Boolean mask with True wherever a cell is missing.
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,True,False,True


In [4]:
# Drop every row that contains at least one missing value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [5]:
# Drop every column that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A
0,1.0
1,5.0
2,10.0


In [6]:
# Drop only the rows in which every value is missing (none here, so all rows stay).
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,,12.0,


In [7]:
# Keep only the rows that have at least 3 non-missing values.
df.dropna(axis=0, thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0


In [8]:
# Drop the rows whose value in column 'C' is missing; other columns are not checked.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,,12.0,


In [9]:
from sklearn.impute import SimpleImputer
import numpy as np
# Mean imputation: each NaN is replaced by the mean of its column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
# fit_transform estimates the per-column means and applies them in one call;
# imr is left as the fitted imputer, ready for further transform() calls.
imr.fit_transform(df.values)



array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. ,  4. , 12. ,  6. ]])

In [10]:
import pandas as pd
# Small example frame with categorical columns (color, size, classlabel)
# and one numeric column (price).
df2 = pd.DataFrame(
    data=[
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
        ['green', 'XL', 12.5, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2
3,green,XL,12.5,class2


In [11]:
# Encode the sizes as integers that preserve their order: M < L < XL.
size_mapping = dict(XL=3, L=2, M=1)
# The column is overwritten in place, so running this cell a second time would
# look up the already-encoded integers in the mapping and find no match.
df2['size'] = df2['size'].map(size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2
3,green,3,12.5,class2


In [12]:
# Reverse lookup (integer code -> size label), used to restore the original strings.
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df2['size'] = df2['size'].map(inv_size_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2
3,green,XL,12.5,class2


In [13]:
import numpy as np
# Class labels are nominal, so the particular label -> integer assignment is
# arbitrary; here each label gets its position among the unique labels.
class_mapping = {
    name: code
    for code, name in enumerate(np.unique(df2['classlabel']))
}
df2['classlabel'] = df2['classlabel'].map(class_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,1
1,red,L,13.5,0
2,blue,XL,15.3,1
3,green,XL,12.5,1


In [14]:
# Swap keys and values (integer code -> class label) and restore the label strings.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
df2['classlabel'] = df2['classlabel'].map(inv_class_mapping)
df2

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2
3,green,XL,12.5,class2


In [15]:
class_mapping.items()  # view of the (class label, integer code) pairs

dict_items([('class1', 0), ('class2', 1)])

In [16]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): the encoder is named class_le and its result y, yet it is fit on
# the 'color' column rather than 'classlabel' — confirm this is intended.
class_le = LabelEncoder()
y = class_le.fit_transform(df2['color'].values)  # one integer code per distinct value
y

array([1, 2, 0, 1])

In [17]:
class_le.inverse_transform(y)  # map the integer codes back to the original strings

array(['green', 'red', 'blue', 'green'], dtype=object)

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding turns one categorical feature into a series of 0/1 indicator
# features, so the training process cannot assume an order among the categories.
X = df2.values
ohot = ColumnTransformer(
    transformers=[
        ('color', OneHotEncoder(categories='auto'), [0]),  # column 0 -> indicator columns
        ('size', 'passthrough', [1]),                      # column 1 kept unchanged
        ('price', 'passthrough', [2]),                     # column 2 kept unchanged
    ]
)
ohot.fit_transform(X)

array([[0.0, 1.0, 0.0, 'M', 10.1],
       [0.0, 0.0, 1.0, 'L', 13.5],
       [1.0, 0.0, 0.0, 'XL', 15.3],
       [0.0, 1.0, 0.0, 'XL', 12.5]], dtype=object)

In [19]:
# One-hot encoding via pandas: the string columns are expanded into indicator
# columns while the numeric 'price' column is carried over unchanged.
pd.get_dummies(
    data=df2[['price', 'color', 'size', 'classlabel']],
)

Unnamed: 0,price,color_blue,color_green,color_red,size_L,size_M,size_XL,classlabel_class1,classlabel_class2
0,10.1,0,1,0,0,1,0,0,1
1,13.5,0,0,1,1,0,0,1,0
2,15.3,1,0,0,0,0,1,0,1
3,12.5,0,1,0,0,0,1,0,1


In [20]:
# Same encoding, but drop_first=True omits the first indicator column of every
# encoded feature (here color_blue, size_L and classlabel_class1).
pd.get_dummies(
    data=df2[['price', 'color', 'size', 'classlabel']],
    drop_first=True,
)

Unnamed: 0,price,color_green,color_red,size_M,size_XL,classlabel_class2
0,10.1,1,0,1,0,1
1,13.5,0,1,0,0,0
2,15.3,0,0,0,1,1
3,12.5,1,0,0,1,1


In [21]:
import ssl
# SECURITY NOTE: this swaps the standard library's default HTTPS context factory
# for one that does NOT verify certificates, and it stays in effect for the rest
# of the kernel session.
# NOTE(review): presumably a workaround for a certificate error when downloading
# the dataset — confirm, and prefer fixing the local CA bundle instead.
ssl._create_default_https_context = ssl._create_unverified_context

In [22]:
# Load the UCI Wine dataset. The file has no header row, so the column labels
# are assigned by hand below.
df_wine = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data', header=None)
# Labels are written without stray trailing whitespace ('Alcalinity of ash' and
# 'Proline' previously ended in a space), so that lookups such as
# df_wine['Proline'] work as expected.
df_wine.columns = ['Class label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium',
                   'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity',
                   'Hue', 'OD280/OD315 of diluted wines', 'Proline']
df_wine

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [23]:
np.unique(df_wine['Class label'])  # distinct class labels present in the dataset

array([1, 2, 3])

In [24]:
from sklearn.model_selection import train_test_split

# Column 0 holds the class label; all remaining columns are the features.
y = df_wine.iloc[:, 0].values
X = df_wine.iloc[:, 1:].values

# Hold out 30% of the samples for testing. stratify=y splits with respect to the
# class labels, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [25]:
X_train.shape  # (number of training samples, number of features)

(124, 13)

In [37]:
from sklearn.preprocessing import StandardScaler

# Standardization: rescale every feature to zero mean and unit standard deviation.
stdsc = StandardScaler()
# The scaling statistics are estimated from the training set and applied to it
# in a single call; stdsc stays fitted for later use on other data.
X_train_std = stdsc.fit_transform(X_train)
# Per-feature mean and standard deviation, before and after scaling.
print(X_train.mean(axis=0), X_train.std(axis=0))
print(X_train_std.mean(axis=0), X_train_std.std(axis=0))

[1.30335484e+01 2.35379032e+00 2.38491935e+00 1.98016129e+01
 9.90887097e+01 2.32483871e+00 2.06411290e+00 3.68064516e-01
 1.64088710e+00 5.08959677e+00 9.54193548e-01 2.61935484e+00
 7.54822581e+02] [8.23368566e-01 1.16920747e+00 2.68077071e-01 3.32703937e+00
 1.40557715e+01 6.14857030e-01 1.01944562e+00 1.21181532e-01
 5.90112089e-01 2.36010810e+00 2.20779765e-01 6.92948052e-01
 3.25392246e+02]
[ 6.89009781e-15  1.68324136e-16  3.79965996e-15  2.85613826e-16
 -3.17846108e-16  1.66264851e-15  1.59818395e-16 -1.19886180e-15
  1.88827448e-15 -1.61519543e-15 -5.78390382e-16  8.63108868e-16
 -1.33405831e-16] [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [38]:
# Scale the test set with the statistics learned from the training set, which is
# why its scaled mean/std are only close to (not exactly) 0 and 1.
X_test_std = stdsc.transform(X_test)
print(np.mean(X_test, axis=0), np.std(X_test, axis=0))
print(np.mean(X_test_std, axis=0), np.std(X_test_std, axis=0))

[1.29250000e+01 2.29629630e+00 2.32425926e+00 1.87907407e+01
 1.01240741e+02 2.22685185e+00 1.94925926e+00 3.47592593e-01
 1.47611111e+00 4.98574074e+00 9.64925926e-01 2.59407407e+00
 7.28685185e+02] [7.71563468e-01 9.74305634e-01 2.81270937e-01 3.22893564e+00
 1.45512316e+01 6.39585584e-01 9.35222001e-01 1.29442325e-01
 5.05196759e-01 2.19502012e+00 2.43386817e-01 7.41086586e-01
 2.85374207e+02]
[-0.1318345  -0.0491735  -0.22627857 -0.30383535  0.15310658 -0.15936527
 -0.11266284 -0.168936   -0.27922828 -0.04400477  0.04861124 -0.03648291
 -0.08032581] [0.93708152 0.83330432 1.04921669 0.97051321 1.03524959 1.04021838
 0.91738292 1.06816875 0.85610305 0.93005067 1.1023964  1.06946918
 0.877016  ]


In [41]:
from sklearn.linear_model import LogisticRegression

# L1-regularised logistic regression, one-vs-rest, trained on the standardized
# features. The L1 penalty leaves several coefficients at exactly zero, as the
# printed weights show.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
lr = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    multi_class='ovr',
).fit(X_train_std, y_train)
print('training accuracy: ', lr.score(X_train_std, y_train))
print('testing accuracy: ', lr.score(X_test_std, y_test))
# Intercepts first, then one row of feature weights per class.
print('weight: ', lr.intercept_, lr.coef_)

training accuracy:  1.0
testing accuracy:  1.0
weight:  [-1.26362616 -1.21602784 -2.37002709] [[ 1.24599971  0.18057748  0.74479889 -1.16255188  0.          0.
   1.16525817  0.          0.          0.          0.          0.55224716
   2.50967287]
 [-1.53666123 -0.3874361  -0.99474649  0.36481975 -0.05952436  0.
   0.66809142  0.          0.         -1.9341457   1.23357277  0.
  -2.23261101]
 [ 0.13541033  0.16911389  0.35779583  0.          0.          0.
  -2.43460414  0.          0.          1.56285918 -0.81818522 -0.49487136
   0.        ]]
