In [1]:
import random
import re

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow import keras

### Task 1

In [2]:
names = ["Wine type", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", "Total phenols", "Flavanoids", 
         "Nonflavanoid phenols", "Proanthocyanins", "Color intensity", "Hue", "OD280", "Proline"]
df = pd.read_csv("./wine.data", index_col=False, names=names)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
Wine type               178 non-null int64
Alcohol                 178 non-null float64
Malic acid              178 non-null float64
Ash                     178 non-null float64
Alcalinity of ash       178 non-null float64
Magnesium               178 non-null int64
Total phenols           178 non-null float64
Flavanoids              178 non-null float64
Nonflavanoid phenols    178 non-null float64
Proanthocyanins         178 non-null float64
Color intensity         178 non-null float64
Hue                     178 non-null float64
OD280                   178 non-null float64
Proline                 178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.6 KB


In [3]:
# normalization, train-test split and one hot encoding
for column in names[1:]:
    df[column] = (df[column] - df[column].mean())/df[column].std()
Y = df.iloc[:, 0].values.reshape(-1, 1)
Y = OneHotEncoder().fit_transform(Y).toarray()
X = df.iloc[:, 1:].values
num_samples = X.shape[0]
train_samples = int(0.75*num_samples)
indexes = np.arange(num_samples)
random.shuffle(indexes)
X, Y = X[indexes], Y[indexes]
X_train, Y_train = X[:train_samples, :], Y[:train_samples]
X_test, Y_test = X[train_samples:, :], Y[train_samples:]
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(133, 13) (133, 3)
(45, 13) (45, 3)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [4]:
# define model
inputs = keras.Input(shape=(13,))
x = keras.layers.Dense(64, activation="relu")(inputs)
outputs = keras.layers.Dense(3, activation="softmax")(x)
fc_model = keras.Model(inputs=inputs, outputs=outputs, name="fc_model")
fc_model.summary()

Model: "fc_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 13)]              0         
_________________________________________________________________
dense (Dense)                (None, 64)                896       
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 195       
Total params: 1,091
Trainable params: 1,091
Non-trainable params: 0
_________________________________________________________________


In [5]:
fc_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
# training
fc_model.fit(X_train, Y_train, batch_size=4, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1880ed3cd88>

In [6]:
# evaluation
test_scores = fc_model.evaluate(X_test, Y_test, verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

2/2 - 0s - loss: 0.6072 - accuracy: 0.9333
Test loss: 0.6072059869766235
Test accuracy: 0.9333333373069763


### Task 2

In [7]:
(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()

In [8]:
# define model
inputs = keras.Input(shape=(28, 28, 1))
x = keras.layers.Conv2D(64, 3, activation="relu")(inputs)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Flatten()(x)
outputs = keras.layers.Dense(10)(x)
cnn_model = keras.Model(inputs=inputs, outputs=outputs, name="cnn_model")
cnn_model.summary()

Model: "cnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 28, 28, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 26, 26, 64)        640       
_________________________________________________________________
batch_normalization (BatchNo (None, 26, 26, 64)        256       
_________________________________________________________________
flatten (Flatten)            (None, 43264)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                432650    
Total params: 433,546
Trainable params: 433,418
Non-trainable params: 128
_________________________________________________________________


In [9]:
cnn_model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
# training
cnn_model.fit(x_train, y_train, batch_size=256, epochs=5, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1881148ffc8>

In [10]:
# evaluation
test_scores = cnn_model.evaluate(x_test, y_test, verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

313/313 - 2s - loss: 0.6729 - accuracy: 0.8803
Test loss: 0.6728541254997253
Test accuracy: 0.880299985408783


### Task 3

In [11]:
df = pd.read_csv("./twits_classification.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [12]:
def delete_stopwords(str_x):
    words = str_x.split(' ')
    neutral_words = ['people', 'wikipedia', 'one', 'say', 'page', 'know', 'go', 'back', 'take', 'see', 'look', 'article',
                     'edit', 'got', 'thing', 'want', 'make']
    new_words = list()
    for word in words:
        if word not in stopwords.words('english') or word not in neutral_words:
            new_words.append(word)
    return ' '.join(new_words)

# delete all unnecessary symbols
df["comment_text"] = df["comment_text"].map(lambda x: re.sub(r'[^\w]', ' ', x))
# lower all words
df["comment_text"] = df["comment_text"].map(lambda x: x.lower())
# delete all stopwords
df["comment_text"] = df["comment_text"].map(delete_stopwords)

In [13]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["comment_text"].values).toarray()
Y = df.iloc[:, 3:].values
num_samples = X.shape[0]
train_samples = int(0.75*num_samples)
indexes = np.arange(num_samples)
random.shuffle(indexes)
X, Y = X[indexes], Y[indexes]
x_train, y_train = X[:train_samples, :], Y[:train_samples]
x_test, y_test = X[train_samples:, :], Y[train_samples:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(750, 9374) (750, 6)
(250, 9374) (250, 6)


In [14]:
# define model
inputs = keras.Input(shape=(9374, 1))
x = keras.layers.LSTM(128)(inputs)
outputs = keras.layers.Dense(6, activation="softmax")(x)
rnn_model = keras.Model(inputs=inputs, outputs=outputs, name="rnn_model")
rnn_model.summary()

Model: "rnn_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 9374, 1)]         0         
_________________________________________________________________
lstm (LSTM)                  (None, 128)               66560     
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 774       
Total params: 67,334
Trainable params: 67,334
Non-trainable params: 0
_________________________________________________________________


In [15]:
rnn_model.compile(
    loss=keras.losses.CategoricalCrossentropy(from_logits=True),
    optimizer=keras.optimizers.RMSprop(),
    metrics=["accuracy"],
)
# training
rnn_model.fit(x_train, y_train, batch_size=4, epochs=3, validation_split=0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x18817d5f1c8>

In [16]:
# evaluation
test_scores = rnn_model.evaluate(x_test, y_test, verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

8/8 - 22s - loss: 0.3257 - accuracy: 0.9960
Test loss: 0.325701504945755
Test accuracy: 0.9959999918937683
