In [1]:
import pandas as pd
import sklearn
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split

In [2]:
# Load the dataset from a CSV file; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../data/skirt_dress_length.csv")

In [3]:
# Keep only the rows where the text and both labels are present, then cast the
# two label columns to int. Doing this as one chain (instead of assigning
# columns on a boolean-filtered frame) avoids pandas' SettingWithCopyWarning
# and produces the same rows, index and dtypes.
df = (
    df
    .dropna(subset=['more_info', 'is_dress', 'is_mini'])
    .astype({'is_dress': int, 'is_mini': int})
)

In [4]:
# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split (and therefore every score computed later) reproducible across
# "Restart & Run All"; without it each run evaluates on different rows.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# (rows, columns) of the training split.
train.shape

(319, 3)

In [6]:
# (rows, columns) of the held-out test split.
test.shape

(80, 3)

In [7]:
# Peek at the first few test rows to sanity-check the text and label columns.
test.head()

Unnamed: 0,more_info,is_dress,is_mini
270,Pull-On Skirt\n\nStyle up the office or your w...,0,1
281,This product is FINAL SALE. Returns and exchan...,0,1
304,"Flawlessly contoured, this pencil skirt evokes...",0,0
289,Marc by Marc Jacobs Edie Pleather Panel Skirt...,0,1
343,Galaxy Ten Capsule Collection. \r\n\r\nCelebra...,0,0


In [8]:
# Pull out the free-text column that serves as the model input for each split.
train_text, test_text = (split['more_info'] for split in (train, test))

### Fit a CountVectorizer to the training set and use it to transform both the training and test sets

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# Case-sensitive bag-of-words features. The vocabulary is learned from the
# training text only and then reused unchanged to encode the test text.
cvec = CountVectorizer(lowercase=False)
train_docs = train_text.astype('U')
test_docs = test_text.astype('U')
x_train = cvec.fit_transform(train_docs)
x_test = cvec.transform(test_docs)

### Fit a LogisticRegression model to the training set with is_dress as the target variable

In [10]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training can be
# chained; the bare name on the last line keeps the cell's displayed output.
logisticRegr = LogisticRegression().fit(x_train, train['is_dress'])
logisticRegr

LogisticRegression()

### Run predictions on the test set

In [11]:
# Predict is_dress for the test rows, using the features produced by the
# vectorizer that was fitted on the training text.
dress_predicted = logisticRegr.predict(x_test)

### Evaluate model on the test set with is_dress as the target variable

In [12]:
from sklearn import metrics

# F1 score of the is_dress predictions against the held-out labels.
dress_f1 = metrics.f1_score(y_true=test['is_dress'], y_pred=dress_predicted)
print(dress_f1)

0.9230769230769231


### Repeat the fit, prediction, and evaluation with is_mini as the target variable

In [13]:
# Train a separate estimator for is_mini instead of refitting logisticRegr in
# place. Refitting would overwrite the fitted is_dress model, so re-running the
# earlier prediction cell would silently return is_mini predictions under the
# name dress_predicted.
mini_lr = LogisticRegression()
mini_lr.fit(x_train, train['is_mini'])
mini_predicted = mini_lr.predict(x_test)
print(metrics.f1_score(test['is_mini'], mini_predicted))

0.6753246753246753


### Wrap the LogisticRegression model in a MultiOutputClassifier object

In [14]:
from sklearn.multioutput import MultiOutputClassifier

# Wrap the logistic-regression estimator so it can be fitted against several
# target columns in a single call.
multi_lr = MultiOutputClassifier(estimator=logisticRegr)

In [15]:
# Select the target columns by name, in a fixed order, rather than "everything
# except more_info". The per-target scoring further down reads column 0 and
# column 1 positionally, so the order must not depend on how the CSV happens to
# be laid out or on extra columns being added to it.
target_cols = ['is_dress', 'is_mini']
y_train = train[target_cols]
y_test = test[target_cols]

### Fit the result to the training set with both is_dress and is_mini as target variables

In [16]:
# Fit the multi-output wrapper on the training features against all columns of
# y_train at once.
multi_lr.fit(x_train, y_train)

MultiOutputClassifier(estimator=LogisticRegression())

In [17]:
# Predict all target columns for the test rows; y_hat is indexed positionally
# by column in the scoring cells that follow.
y_hat = multi_lr.predict(x_test)

In [18]:
# Convert the test targets to a NumPy array so columns can be sliced by
# position. Wrapping in pd.DataFrame first keeps the cell safe to re-run: once
# y_test is already an ndarray, calling y_test.to_numpy() directly would raise
# AttributeError.
y_test = pd.DataFrame(y_test).to_numpy()

In [19]:
# F1 for the first target column of the multi-output model (is_dress, given
# the column order shown by test.head()).
metrics.f1_score(y_true=y_test[:, 0], y_pred=y_hat[:, 0])

0.9230769230769231

In [20]:
# F1 for the second target column of the multi-output model (is_mini, given
# the column order shown by test.head()).
metrics.f1_score(y_true=y_test[:, 1], y_pred=y_hat[:, 1])

0.6753246753246753