In [7]:
import pandas as pd

# sonar.csv has no header row: a default read_csv would promote the first
# sample to column names, so the names are supplied explicitly
# (60 attribute columns followed by the class label).
cols = ['att_' + str(i) for i in range(1, 61)] + ['class']
df = pd.read_csv("./data/sonar.csv", header=None, names=cols)

# scikit-learn needs numeric values (unlike, e.g. R)...
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['class_numeric'] = le.fit_transform(df['class'])

# A different way to obtain the same encoding without LabelEncoder
# ('R' -> 1, anything else -> 0). It is compared against the encoder's result
# rather than stored as a throw-away column that is dropped right after.
assert ((df['class'] == 'R').astype(int) == df['class_numeric']).all()

# The text label is redundant once encoded; rebind df instead of mutating it
# in place.
df = df.drop('class', axis=1)

# Show the classes (position in this array == encoded value)
le.classes_

array(['M', 'R'], dtype=object)

In [8]:
# Map encoded values back to the original class labels
# (0 -> 'M', 1 -> 'R', i.e. the index into le.classes_)
le.inverse_transform(
    [0, 1, 0]
)

  if diff:


array(['M', 'R', 'M'], dtype=object)

In [9]:
# ---------------------------------------------------------------
# Introducing pipelines
# ---------------------------------------------------------------

# Feature matrix: every column except the encoded target, as a plain array.
X = df.loc[:, df.columns != 'class_numeric'].copy().values

# Target vector: the encoded class label, as a plain array.
y = df.loc[:, 'class_numeric'].copy().values



In [15]:
# Recap: a baseline logistic regression on the raw (unscaled) features
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Fit on the training part, then report the score on the held-out part
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)


0.9038461538461539

In [16]:
# Easy enough... but not the full story. Sometimes we need to:
# - be sure that all the features are on the same scale
from sklearn.preprocessing import StandardScaler
scl = StandardScaler()

# The scaler is fitted on the training data only; the test data is merely
# transformed with those same statistics, so it does not influence the
# preprocessing.
X_train_rescaled = scl.fit_transform(X_train)
X_test_rescaled = scl.transform(X_test)

# Fit the model again, this time on the rescaled features
# (this refits the existing `lr` object in place)
lr.fit(X_train_rescaled, y_train)
lr.score(X_test_rescaled, y_test)



0.8269230769230769

In [17]:
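# Note: on this split the scaled model scores lower than the unscaled one
# (0.83 vs 0.90), so rescaling is not automatically "better". Either way, it
# helps to have a more structured workflow that bundles preprocessing and
# model into a single estimator.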

from sklearn.pipeline import Pipeline

# Chain the scaler and the classifier: fit() runs the steps in order on the
# training data, and score() pushes the test data through the same steps.
estimator = Pipeline([
    ('z-score', scl),
    ('lr', lr),
])
estimator.fit(X_train, y_train)
estimator.score(X_test, y_test)


0.8269230769230769