### Import Packages


In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.layers import Input, Dense, Embedding, MaxPooling1D, Conv1D, SpatialDropout1D
from keras.layers import add, Dropout, PReLU, BatchNormalization, GlobalMaxPooling1D, MaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import optimizers
from keras import initializers, regularizers, constraints, callbacks
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import *
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC, LinearSVC


### Read in datafile

In [2]:
train = pd.read_csv("TIL_NLP_train_dataset.csv")
test = pd.read_csv("TIL_NLP_test_dataset.csv")
print(train.shape)
print(test.shape)

(7380, 7)
(2460, 2)


In [3]:
train.head(3) ##Inpect

Unnamed: 0,id,word_representation,outwear,top,trousers,women dresses,women skirts
0,0,w7718 w173355 w138132 w232277 w90685 w314686 w...,1,0,1,0,0
1,1,w195317 w127737 w171593 w22890 w342007 w217871...,1,0,1,0,0
2,2,w247655 w270233 w261113 w337250 w366000 w37873...,0,1,1,0,0


In [43]:
test.head(3) ##Inpect

Unnamed: 0,id,word_representation
0,0,w373517 w383437 w374393 w87179 w289496 w327385...
1,1,w237465 w167111 w279437 w194870 w351537 w17560...
2,2,w151648 w93366 w121255 w193800 w71240 w48576 w...


### Extract input and labels

In [3]:
train_input = train["word_representation"]
train_labels = train[train.columns[2:]]
test_input = test["word_representation"]
test_id = test["id"]

print(train_input.head(1))
print(train_labels.head(1))
print(test_input.head(1))

0    w7718 w173355 w138132 w232277 w90685 w314686 w...
Name: word_representation, dtype: object
   outwear  top  trousers  women dresses  women skirts
0        1    0         1              0             0
0    w373517 w383437 w374393 w87179 w289496 w327385...
Name: word_representation, dtype: object


### Tokenise and pad

In [46]:
x_train, x_test, y_train, y_test = train_test_split(train_input, train_labels, test_size=0.15, shuffle = True, random_state = 255)

cvect = CountVectorizer(stop_words="english", analyzer='word', 
                             ngram_range=(1, 6), max_df=1.0, min_df=1, max_features=None)
cvect.fit(x_train)
x_train_vectorized = cvect.transform(x_train)
x_test_vectorized = cvect.transform(x_test)

print(x_train_vectorized.shape)
print(x_test_vectorized.shape)
print(y_train.shape)

(6273, 82555)
(1107, 82555)
(6273, 5)


### Using LinearSVC

In [34]:
clf1 = LinearSVC(C=0.32, penalty="l2")
clf2 = LinearSVC(C=0.32, penalty="l2")
clf3 = LinearSVC(C=0.32, penalty="l2")
clf4 = LinearSVC(C=0.32, penalty="l2")
clf5 = LinearSVC(C=0.32, penalty="l2")

"outwear", "top", "trousers", "women dresses", "women skirts"
clf1.fit(x_train_vectorized, y_train["outwear"])
clf2.fit(x_train_vectorized, y_train["top"])
clf3.fit(x_train_vectorized, y_train["trousers"])
clf4.fit(x_train_vectorized, y_train["women dresses"])
clf5.fit(x_train_vectorized, y_train["women skirts"])


y_pred1 = clf1.predict(x_test_vectorized)
y_pred2 = clf2.predict(x_test_vectorized)
y_pred3 = clf3.predict(x_test_vectorized)
y_pred4 = clf4.predict(x_test_vectorized)
y_pred5 = clf5.predict(x_test_vectorized)

y_pred = np.stack((y_pred1,y_pred2,y_pred3, y_pred4, y_pred5), axis = 1)

from sklearn.metrics import accuracy_score 
print("Accuracy score using CountVectorizer and LinearSVC: {}".format(accuracy_score(y_pred, y_test)))

Accuracy score using CountVectorizer and LinearSVC: 0.9214092140921409


### Using SGDClassifier

In [45]:
from sklearn.linear_model import SGDClassifier

clf1 = SGDClassifier(alpha = 0.2)
clf2 = SGDClassifier(alpha = 0.2)
clf3 = SGDClassifier(alpha = 0.2)
clf4 = SGDClassifier(alpha = 0.2)
clf5 = SGDClassifier(alpha = 0.2)

"outwear", "top", "trousers", "women dresses", "women skirts"
clf1.fit(x_train_vectorized, y_train["outwear"])
clf2.fit(x_train_vectorized, y_train["top"])
clf3.fit(x_train_vectorized, y_train["trousers"])
clf4.fit(x_train_vectorized, y_train["women dresses"])
clf5.fit(x_train_vectorized, y_train["women skirts"])


y_pred1 = clf1.predict(x_test_vectorized)
y_pred2 = clf2.predict(x_test_vectorized)
y_pred3 = clf3.predict(x_test_vectorized)
y_pred4 = clf4.predict(x_test_vectorized)
y_pred5 = clf5.predict(x_test_vectorized)

y_pred = np.stack((y_pred1,y_pred2,y_pred3, y_pred4, y_pred5), axis = 1)

from sklearn.metrics import accuracy_score 
print("Accuracy score using CountVectorizer and SGD: {}".format(accuracy_score(y_pred, y_test)))

Accuracy score using CountVectorizer and LinearSVC: 0.5338753387533876


### Using Logistic Regression


In [50]:
from sklearn.linear_model import LogisticRegression

clf1 = LogisticRegression(C=0.25, max_iter = 1000, solver = 'lbfgs')
clf2 = LogisticRegression(C=0.25, max_iter = 1000, solver = 'lbfgs')
clf3 = LogisticRegression(C=0.25, max_iter = 1000, solver = 'lbfgs')
clf4 = LogisticRegression(C=0.25, max_iter = 1000, solver = 'lbfgs')
clf5 = LogisticRegression(C=0.25, max_iter = 1000, solver = 'lbfgs')

"outwear", "top", "trousers", "women dresses", "women skirts"
clf1.fit(x_train_vectorized, y_train["outwear"])
clf2.fit(x_train_vectorized, y_train["top"])
clf3.fit(x_train_vectorized, y_train["trousers"])
clf4.fit(x_train_vectorized, y_train["women dresses"])
clf5.fit(x_train_vectorized, y_train["women skirts"])


y_pred1 = clf1.predict(x_test_vectorized)
y_pred2 = clf2.predict(x_test_vectorized)
y_pred3 = clf3.predict(x_test_vectorized)
y_pred4 = clf4.predict(x_test_vectorized)
y_pred5 = clf5.predict(x_test_vectorized)

y_pred = np.stack((y_pred1,y_pred2,y_pred3, y_pred4, y_pred5), axis = 1)

from sklearn.metrics import accuracy_score 
print("Accuracy score using CountVectorizer and SGD: {}".format(accuracy_score(y_pred, y_test)))

Accuracy score using CountVectorizer and SGD: 0.9024390243902439


### LSTM

In [51]:
vectorizer = CountVectorizer(ngram_range=(1, 6), token_pattern=r'\w{1,}', stop_words = None, min_df = 1)
X_train_onehot = vectorizer.fit_transform(x_train)

model = Sequential()
model.add(Dense(units=500, activation='relu', input_dim=len(vectorizer.get_feature_names())))
model.add(Conv1D(512, 5, activation='relu'))
model.add(Dropout(0.5)(x))

model.add(Dense(units=1, activation='sigmoid'))
 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

ValueError: Input 0 of layer bidirectional is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 500]

### Preparing for submission

In [39]:
cvect = CountVectorizer(stop_words="english", analyzer='word', 
                             ngram_range=(1, 3), max_df=1.0, min_df=1, max_features=None)
cvect.fit(train_input)
x_train_vectorized = cvect.transform(train_input)
x_test_vectorized = cvect.transform(test_input)

print(x_train_vectorized.shape)
print(x_test_vectorized.shape)
print(train_labels.shape)

(7380, 92095)
(2460, 92095)
(7380, 5)


In [40]:
clf1 = LinearSVC(C=0.32, penalty="l2")
clf2 = LinearSVC(C=0.32, penalty="l2")
clf3 = LinearSVC(C=0.32, penalty="l2")
clf4 = LinearSVC(C=0.32, penalty="l2")
clf5 = LinearSVC(C=0.32, penalty="l2")

## "outwear", "top", "trousers", "women dresses", "women skirts"
clf1.fit(x_train_vectorized, train_labels["outwear"])
clf2.fit(x_train_vectorized, train_labels["top"])
clf3.fit(x_train_vectorized, train_labels["trousers"])
clf4.fit(x_train_vectorized, train_labels["women dresses"])
clf5.fit(x_train_vectorized, train_labels["women skirts"])


y_pred1 = clf1.predict(x_test_vectorized)
y_pred2 = clf2.predict(x_test_vectorized)
y_pred3 = clf3.predict(x_test_vectorized)
y_pred4 = clf4.predict(x_test_vectorized)
y_pred5 = clf5.predict(x_test_vectorized)

y_pred = np.stack((y_pred1,y_pred2,y_pred3, y_pred4, y_pred5), axis = 1)

In [42]:
labels = pd.DataFrame(y_pred, columns = ["outwear", "top", "trousers", "women dresses", "women skirts"])
output = pd.concat([test_id , labels], axis = 1)
output.head(3)
output.to_csv('submission3.csv',index = False)