<h1>1. Importing packages</h1>

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras
from tensorflow.keras import callbacks

<h1>2. Loading the dataset</h1>

In [1]:
# Select the active Google Cloud project for the gcloud CLI.
# IPython expands {project_id} into the shell command before running it.
project_id = 'stalwart-veld-364418'
!gcloud config set project {project_id}

Updated property [core/project].


Updates are available for some Google Cloud CLI components.  To install them,
please run:
  $ gcloud components update



To take a quick anonymous survey, run:
  $ gcloud survey



In [None]:
# Copy the preprocessed dataset (preproc_data.csv) from the Google Cloud
# Storage bucket to the local path /content/preproc_data.csv.
!gsutil cp gs://lewagon-jbaccarin-bucket/preproc_dataset/preproc_data.csv /content/preproc_data.csv

Copying gs://lewagon-jbaccarin-bucket/preproc_dataset/preproc_data.csv...
/ [1 files][267.2 MiB/267.2 MiB]                                                
Operation completed over 1 objects/267.2 MiB.                                    


In [None]:
# Load the dataset from the exact path the download cell wrote it to, so the
# read does not depend on the kernel's current working directory.
data = pd.read_csv('/content/preproc_data.csv')
data.shape

NameError: ignored

<h1>3. Splitting and vectorizing</h1>

In [None]:
# Target: the author's username. Feature: the raw source-code text.
y, X = data['username'], data["code_source"]

In [None]:
# Fit the encoder on all usernames, then replace `y` with the integer class ids.
# `lb` is kept so predictions can be mapped back to usernames later.
lb = LabelEncoder().fit(y)
y = lb.transform(y)

In [None]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

<h3>3.1. TF-IDF on text data:</h3>


In [None]:
# Binary TF-IDF over at most 2500 tokens. The pattern is a raw string so the
# regex escapes (\w, \s) are not treated as (invalid) Python string escapes.
# A token is a run of word characters, a run of whitespace, or any other run
# of characters up to the end of the line.
# Alternatives tried earlier: token_pattern='[^ ()]+', ngram_range=(1,5)
tfidf = TfidfVectorizer(binary=True, max_features=2500, token_pattern=r'([\w]+|[\s]+|.+)')


def tfidf_features(txt, flag):
    """Vectorize `txt` with the module-level `tfidf` vectorizer.

    Parameters
    ----------
    txt : iterable of str
        Documents to vectorize.
    flag : str
        "train" fits the vectorizer on `txt` before transforming it; any
        other value only transforms with the already-fitted vectorizer.

    Returns
    -------
    scipy sparse matrix of dtype float64, one row per document.
    """
    if flag == "train":
        x = tfidf.fit_transform(txt)
    else:
        x = tfidf.transform(txt)
    return x.astype('float64')


# `.toarray()` returns a plain ndarray; `.todense()` would return the
# discouraged `np.matrix` type.
X_train = tfidf_features(X_train, flag="train").toarray()
X_test = tfidf_features(X_test, flag="test").toarray()

<h3>3.2. Making Target as Categorical</h3>

In [None]:
# One-hot encode both splits with the SAME width. Without `num_classes`,
# to_categorical infers the width from the largest label in each array, so the
# two splits could end up with different shapes if the highest class id is
# present in only one of them.
num_classes = len(lb.classes_)
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

<h1>4. Building the model</h1>


In [None]:
y_train.shape[1]  # width of the one-hot training target; used below as the size of the softmax output layer

2114

In [None]:
# 1-D CNN over the TF-IDF vector, treated as a sequence with a single channel:
# three Conv1D(128) + Dropout(0.6) stages with kernel sizes 3, 4 and 5,
# flattened into a softmax layer with one unit per one-hot target column.
model = keras.Sequential([
    keras.layers.Conv1D(128, kernel_size=3, activation='relu',
                        input_shape=[X_train.shape[1], 1]),
    keras.layers.Dropout(0.6),
    keras.layers.Conv1D(128, kernel_size=4, activation='relu'),
    keras.layers.Dropout(0.6),
    keras.layers.Conv1D(128, kernel_size=5, activation='relu'),
    keras.layers.Dropout(0.6),
    keras.layers.Flatten(),
    keras.layers.Dense(y_train.shape[1], activation='softmax'),
])

# Multi-class, one-hot targets -> categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

<h1>5. Training the model</h1>
<p>Don't forget to turn on GPU</p>

In [None]:
# Stop when the validation loss has not improved for 5 consecutive epochs and
# restore the weights from the best epoch. `val_loss` is EarlyStopping's
# default monitor; it is spelled out so the stopping criterion is explicit.
es = callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    X_train,
    y_train,
    epochs=200,            # upper bound; early stopping may end training sooner
    batch_size=64,
    verbose=1,
    validation_split=0.2,  # 20% of the training rows are held out for validation
    callbacks=[es],
)

<h1>6. Plotting learning curves</h1>
Learning curves show whether the model is overfitting or underfitting.

In [None]:
print(history.history.keys())

# One figure per metric, comparing the training curve with the curve measured
# on the rows held out by `validation_split` during fit. The second curve is
# validation data, not the test set, so it is labelled accordingly.
for metric in ('accuracy', 'loss'):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history[f'val_{metric}'], label='validation')
    ax.set(title=f'model {metric}', xlabel='epoch', ylabel=metric)
    ax.legend(loc='upper left')
    plt.show()


<h1>7. Evaluating model and predicting</h1>

In [None]:
# Loss followed by the compiled metrics (accuracy) on the held-out test set.
# Stored as `test_metrics` so the Python builtin `eval` is not shadowed.
test_metrics = model.evaluate(X_test, y_test)
test_metrics

In [None]:
# Softmax outputs of the model for every test sample (one row per sample).
predictions_encoded = model.predict(X_test)
predictions_encoded.shape

<h3>7.1. Converting predicted vectors to names of authors</h3>

In [None]:
# Take the highest-scoring class index of every row in one vectorized call,
# then map the integer class ids back to the original usernames.
predictions = lb.inverse_transform(np.argmax(predictions_encoded, axis=1))
predictions

<h1>8. Exporting Pickles</h1>

In [None]:
# Persist the fitted artifacts. Each file is opened in a `with` block so the
# handle is flushed and closed deterministically before the upload step.
# NOTE(review): whether a Keras model can be pickled depends on the installed
# Keras/TensorFlow version; `model.save(...)` is the documented way to persist
# a model. Consider switching, and update the upload cell's filename if so.
artifacts = {
    './cnn_labelenc.pkl': lb,   # label encoder
    './cnn_tfidf.pkl': tfidf,   # tfidf vectorizer
    './cnn_model.pkl': model,   # trained model
}
for path, artifact in artifacts.items():
    with open(path, 'wb') as fh:
        pickle.dump(artifact, fh)

In [None]:
# Upload the three pickled artifacts written by the previous cell to the
# models/ folder of the Cloud Storage bucket.
!gsutil cp cnn_tfidf.pkl gs://lewagon-jbaccarin-bucket/models/
!gsutil cp cnn_labelenc.pkl gs://lewagon-jbaccarin-bucket/models/
!gsutil cp cnn_model.pkl gs://lewagon-jbaccarin-bucket/models/

CommandException: No URLs matched: /tmp/to_upload.txt
