In [0]:
from tqdm import tqdm
import numpy as np
import pandas as pd

import tensorflow as tf
from keras import Sequential
from keras.layers import Flatten, Dense
from keras.layers import Input
from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.models import load_model

# Mount Google Drive inside the Colab VM; later cells read the dataset and the
# saved model through paths that start with 'drive/'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using TensorFlow backend.


In [0]:
# Load the combined text dataset (tab-separated file on the mounted drive)
dataset_fp = 'drive/Team Drives/Deep Learning Project/ken_cnn/combined_text_dataset.csv'
df = pd.read_csv(dataset_fp, sep='\t')

In [0]:
# Drop the unused 'Unnamed: 0' column when it is present.
# `errors='ignore'` makes the drop a no-op if the column is already gone, and
# reassigning (rather than `inplace=True`) avoids mutating the loaded frame
# behind the back of anything that still references it.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head(3)

Unnamed: 0,id_odsp,text,ht,at,fthg,ftag,winner
0,004f4ING/,Bafetimbi Gomis (Swansea City) wins a free kic...,Southampton,Swansea,0,1,away
1,00LMl81F/,"Offside, Milan. Kevin-Prince Boateng tries a t...",AS Roma,AC Milan,2,3,away
2,00OX4xFp/,Attempt missed. Bernardo Silva (Monaco) header...,AS Monaco,Lille,0,0,tie


In [0]:
# Show stats of dataset
# Count every outcome in a single pass; `.get(..., 0)` reports 0 instead of
# raising KeyError if one of the outcomes never occurs in the data.
winner_counts = df['winner'].value_counts()
home_count = winner_counts.get('home', 0)
away_count = winner_counts.get('away', 0)
tie_count = winner_counts.get('tie', 0)
total_rows = df.shape[0]

print("Home wins: {} ==> {}%".format(home_count, home_count * 100 / total_rows))
print("Away wins: {} ==> {}%".format(away_count, away_count * 100 / total_rows))
print("Tie: {} ==> {}%".format(tie_count, tie_count * 100 / total_rows))

Home wins: 4189 ==> 46.164866651972666%
Away wins: 2576 ==> 28.38880317390346%
Tie: 2309 ==> 25.44633017412387%


In [0]:
# Check number of unique words
# Iterate over the 'text' column directly: `iterrows()` would build a full
# Series for every row just to read this one field.
unique_words = set()
max_len = -1
for text in tqdm(df['text'], total=df.shape[0]):
  split_text = text.split()  # whitespace tokenisation
  unique_words.update(split_text)

  # Check max length (in whitespace-separated tokens)
  max_len = max(max_len, len(split_text))

print("\nTotal number of unique words: {0}".format(len(unique_words)))
print("Max comment word length: {0}".format(max_len))

100%|██████████| 9074/9074 [00:02<00:00, 3606.33it/s]


Total number of unique words: 11201
Max comment word length: 1132





In [0]:
# Alternative: keep the whole whitespace-token vocabulary
# num_words = len(unique_words)
# NOTE(review): this comment used to say "1000", and max_features is set to
# 1000 further down, but the value actually passed to the Tokenizer is 200 —
# confirm which vocabulary size is intended.
num_words = 200 # Keep only the 200 most common words

In [0]:
from keras.preprocessing.text import Tokenizer

# Raw comment strings, one per dataset row
samples = df['text'].tolist()

# Tokenizer restricted to the <num_words> most common words; fitting it on the
# samples builds the word index
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(samples)

In [0]:
# Turns strings into lists of integer indices
# (one list per entry of `samples`, using the tokenizer fitted on those samples)
sequences = tokenizer.texts_to_sequences(samples)
# Uncomment to inspect the first encoded comment:
# sequences[0]

In [0]:
# --- Setting up constants ---
# Number of words as features, we keep only the top most-common words
# NOTE(review): max_features is not referenced by any cell visible here and
# disagrees with num_words = 200 given to the Tokenizer — confirm which is intended.
max_features = 1000
# Max number of words in comments (truncate the rest)
# This assignment replaces the max_len measured by the unique-word scan; both
# are 1132, the longest comment reported by that scan.
max_len = 1132 # If not set here, will use the max comment length which is 1132

In [0]:
from keras import preprocessing
# Bring every encoded comment to exactly max_len entries so they stack into a
# single 2-D array (one row per comment).
x_train = preprocessing.sequence.pad_sequences(sequences, maxlen = max_len)
# Expect (number of comments, max_len)
x_train.shape

(9074, 1132)

In [0]:
def get_label(text):
  """Return the one-hot label for a match outcome.

  The vector layout is [home, tie, away].

  Args:
    text: Value of the 'winner' column: "home", "tie" or "away".

  Returns:
    A new 3-element list with a single 1 in the slot of the given outcome.
    Any unrecognised value (e.g. a missing entry) yields [0,0,0].

  NOTE(review): "tie" used to fall through to the all-zero vector, so ties had
  no class of their own. Anything fitted against the old encoding should be
  re-checked — confirm.
  """
  if text == "home":
    return [1,0,0]
  elif text == "tie":
    return [0,1,0]
  elif text == "away":
    return [0,0,1]
  else:
    # Unknown outcome: keep the all-zero fallback rather than raising
    return [0,0,0]

In [0]:

# Encode each row's 'winner' value with get_label and stack the resulting
# vectors into one array (one row per match)
y_train = np.array([get_label(winner) for winner in df['winner']])
len(y_train)

9074

In [0]:
# Set parameters
# NOTE(review): neither value is used by the cells visible here (the model is
# loaded from disk rather than built) — presumably kept for model construction; confirm.
input_number = max_len # Length of input
# Number of distinct whitespace-separated tokens found by the unique-word scan
vocabulary_size = len(unique_words)

In [0]:
# Load model
# Path of the saved Keras model on the mounted drive
model_fp = 'drive/Team Drives/Deep Learning Project/ken_cnn/cnn_64.model'
model = load_model(model_fp)

In [0]:
# Print the layer-by-layer architecture (output shapes and parameter counts)
# of the loaded model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 1132, 100)         1120100   
_________________________________________________________________
dropout_27 (Dropout)         (None, 1132, 100)         0         
_________________________________________________________________
conv1d_72 (Conv1D)           (None, 1128, 64)          32064     
_________________________________________________________________
max_pooling1d_72 (MaxPooling (None, 282, 64)           0         
_________________________________________________________________
conv1d_73 (Conv1D)           (None, 278, 32)           10272     
_________________________________________________________________
max_pooling1d_73 (MaxPooling (None, 69, 32)            0         
_________________________________________________________________
conv1d_74 (Conv1D)           (None, 65, 16)            2576      
__________

In [0]:
# Extract output from flatten layer
# A second model that shares the loaded model's input but stops at the named
# layer, so predict() returns that layer's activations
layer_name = 'flatten_25'
flatten_output = model.get_layer(layer_name).output
intermediate_layer_model = Model(inputs=model.input, outputs=flatten_output)
intermediate_output = intermediate_layer_model.predict(x_train)
intermediate_output.shape

(9074, 256)

In [0]:
# Save np.array to file
# Plain-text dump of the extracted features
cnn_output_fp = 'drive/Team Drives/Deep Learning Project/ken_cnn/cnn_output.txt'
np.savetxt(cnn_output_fp, intermediate_output)

In [0]:
# Load np.array from file
# Read the saved features back and check their shape
cnn_output_fp = 'drive/Team Drives/Deep Learning Project/ken_cnn/cnn_output.txt'
loaded = np.loadtxt(cnn_output_fp)
loaded.shape

(9074, 256)