In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd

import tensorflow as tf
from keras import Sequential
from keras.layers import Flatten, Dense
from keras.layers import Input
from keras.layers import LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.models import Model

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Using TensorFlow backend.


## Read combined text dataset

In [0]:
df = pd.read_csv('drive/Team Drives/Deep Learning Project/ken_cnn/combined_text_dataset.csv', sep='\t')

In [3]:
if 'Unnamed: 0' in df.columns:
  df.drop('Unnamed: 0', axis=1, inplace=True) # Drop unused column
df.head(3)

Unnamed: 0,id_odsp,text,ht,at,fthg,ftag,winner
0,004f4ING/,Bafetimbi Gomis (Swansea City) wins a free kic...,Southampton,Swansea,0,1,away
1,00LMl81F/,"Offside, Milan. Kevin-Prince Boateng tries a t...",AS Roma,AC Milan,2,3,away
2,00OX4xFp/,Attempt missed. Bernardo Silva (Monaco) header...,AS Monaco,Lille,0,0,tie


In [4]:
# Show stats of dataset
home_count = df.groupby('winner')['winner'].count()['home']
away_count = df.groupby('winner')['winner'].count()['away']
tie_count = df.groupby('winner')['winner'].count()['tie']

print("Home wins: {} ==> {}%".format(home_count, home_count * 100 / df.shape[0]))
print("Away wins: {} ==> {}%".format(away_count, away_count * 100 / df.shape[0]))
print("Tie: {} ==> {}%".format(tie_count, tie_count * 100 / df.shape[0]))

Home wins: 4189 ==> 46.164866651972666%
Away wins: 2576 ==> 28.38880317390346%
Tie: 2309 ==> 25.44633017412387%


## One-hot Encoding

In [5]:
# Check number of unique words
unique_words = set()
max_len = -1
for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
  split_text = row['text'].split()
  for t in split_text:
    unique_words.add(t)
    
  # Check max length
  if len(split_text) > max_len:
    max_len = len(split_text)

print("\nTotal number of unique words: {0}".format(len(unique_words)))
print("Max comment word length: {0}".format(max_len))

100%|██████████| 9074/9074 [00:02<00:00, 3579.98it/s]


Total number of unique words: 11201
Max comment word length: 1132





In [0]:
# num_words = len(unique_words)
num_words = 200 # Most 1000 common words

In [0]:
import pickle
from keras.preprocessing.text import Tokenizer
samples = list(df.text)
# Creates a tokenizer, configured to only take into account the <num_words> most common words
tokenizer = Tokenizer(num_words=num_words)
# Building the word index
tokenizer.fit_on_texts(samples)

In [0]:
# Turns strings into lists of integer indices
sequences = tokenizer.texts_to_sequences(samples)
# sequences[0]

In [0]:
# Turns string into binary vector of dim 1000 (based on word limit above)
# one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')
# pd.DataFrame(one_hot_results).head(3)

In [10]:
# Dictionary mapping of words to one-hot-encoded index value
word_index = tokenizer.word_index
print('Found {} unique tokens'.format(len(word_index)))
print('The dictionary mapping of tokens is\n {}'.format(word_index))

Found 6297 unique tokens
The dictionary mapping of tokens is
 {'the': 1, 'by': 2, 'a': 3, 'foul': 4, 'kick': 5, 'free': 6, 'wins': 7, 'is': 8, 'in': 9, 'right': 10, 'from': 11, 'box': 12, 'attempt': 13, 'shot': 14, 'footed': 15, 'left': 16, 'half': 17, 'assisted': 18, 'corner': 19, 'of': 20, 'defensive': 21, 'blocked': 22, 'saved': 23, 'centre': 24, 'outside': 25, 'conceded': 26, 'offside': 27, 'missed': 28, 'to': 29, 'but': 30, 'on': 31, 'ball': 32, 'with': 33, 'wing': 34, 'attacking': 35, 'misses': 36, 'goal': 37, 'through': 38, 'caught': 39, 'tries': 40, 'fc': 41, 'cross': 42, 'side': 43, 'de': 44, 'real': 45, 'high': 46, 'bottom': 47, 'header': 48, '1': 49, 'following': 50, 'card': 51, 'shown': 52, 'yellow': 53, 'close': 54, 'city': 55, 'for': 56, 'milan': 57, 'bad': 58, 'madrid': 59, 'too': 60, 'sv': 61, 'borussia': 62, '0': 63, 'and': 64, 'top': 65, 'united': 66, 'wide': 67, '04': 68, 'cf': 69, 'substitution': 70, 'hand': 71, 'replaces': 72, 'manchester': 73, 'juventus': 74, 'eti

## Feature Preparation

In [0]:
# --- Setting up constants ---
# Number of words as features, we keep only the top most-common words
max_features = 1000
# Max number of words in comments (truncate the rest)
max_len = 1132 # If not set here, will use the max comment length which is 1132

In [12]:
from keras import preprocessing
x_train = preprocessing.sequence.pad_sequences(sequences, maxlen = max_len)
x_train.shape

(9074, 1132)

In [0]:
# # --- Use one-hot encode ---
# one_hot_x_train = []
# for seq in tqdm(x_train, total=x_train.shape[0]):
#   feature_vector = []
#   for i in seq:
#     feature_vector += list(one_hot_results[i])
#   one_hot_x_train.append(feature_vector)

In [0]:
# one_hot_x_train = pd.DataFrame(one_hot_x_train)
# one_hot_x_train.shape

In [0]:
# one_hot_x_train[0]

In [0]:
def get_label(text):
  if text == "home":
    return [1,0,0]
  elif text == "away":
    return [0,0,1]
  elif text == 'tie':
    return [0,1,0]

In [17]:

y_train = np.array(list(df.winner.map(get_label)))
len(y_train)

9074

## Simple ANN model

In [0]:
# Set parameters
# input_number = num_words * max_len # Length of one-hot vector

In [0]:
# # --- Setting up a Sigmoid Sequential Model ---
# # Initialize model
# model = Sequential()
# # Adds a densely-connected layer with input_number units to the model:
# model.add(Dense(32, input_shape = (input_number,), activation='sigmoid'))
# # Add another:
# model.add(Dense(64, activation='relu'))
# # Add another:
# model.add(Dense(32, activation='relu'))
# # Add a softmax layer with 3 output units:
# model.add(Dense(3, activation='softmax'))
# # Check model summary
# model.summary()

In [0]:
# model.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])

In [0]:
# model.fit(one_hot_x_train, y_train,
#                    epochs = 2,
#                    batch_size = 64,
#                    validation_split = 0.2)

## Top team to be test

In [0]:
top_num = 1

In [0]:
import operator
from tqdm import trange

def get_top_win_teams(df):
  win_counts = dict()
  
  for idx in trange(df.shape[0]):
    ht = df.iloc[idx]['ht']
    at = df.iloc[idx]['at']
    winner = df.iloc[idx]['winner']
    
    if winner == 'home':
      winner_team = ht
    elif winner == 'away':
      winner_team = at
    else:
      continue
      
    if winner_team in win_counts:
      win_counts[winner_team] += 1
    else:
      win_counts[winner_team] = 1
      
  sorted_win_counts = sorted(win_counts.items(), key=operator.itemgetter(1), reverse=True)
    
  return [t for t, score in sorted_win_counts]

In [24]:
top_team_names = get_top_win_teams(df)
top_team_names = top_team_names[:top_num] # Top 10

100%|██████████| 9074/9074 [00:05<00:00, 1576.01it/s]


In [0]:
def get_top_team_won(df, top_team_names):
  indexes = []
  for i in trange(df.shape[0]):
    ht = df.iloc[i]['ht']
    at = df.iloc[i]['at']
    if (ht in top_team_names) or (at in top_team_names):
      indexes.append(i)
    
  return df.iloc[indexes]

In [26]:
top_team_df = get_top_team_won(df, top_team_names)

100%|██████████| 9074/9074 [00:03<00:00, 2357.21it/s]


In [27]:
test_sequences = tokenizer.texts_to_sequences(list(top_team_df.text))
x_test = preprocessing.sequence.pad_sequences(test_sequences, maxlen=max_len)
x_test.shape

(201, 1132)

In [28]:
y_test = pd.DataFrame([ get_label(winner.strip()) for winner in top_team_df['winner'] ])
y_test.shape

(201, 3)

## CNN model

In [0]:
# Set parameters
input_number = max_len # Length of input
vocabulary_size = len(unique_words)

In [30]:
# --- Setting up a Sigmoid Sequential Model ---
# Initialize model
model_conv = Sequential()
model_conv.add(Embedding(vocabulary_size, 256, input_length=input_number))
model_conv.add(Dropout(0.2))
model_conv.add(Conv1D(64, 5, activation='relu'))
model_conv.add(MaxPooling1D(pool_size=4))
model_conv.add(Conv1D(32, 5, activation='relu'))
model_conv.add(MaxPooling1D(pool_size=4))
model_conv.add(Conv1D(16, 5, activation='relu'))
model_conv.add(MaxPooling1D(pool_size=4))
model_conv.add(Flatten())
model_conv.add(Dense(3, activation='softmax'))
model_conv.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1132, 256)         2867456   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1132, 256)         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1128, 64)          81984     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 282, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 278, 32)           10272     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 69, 32)            0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 65, 16)            2576      
__________

In [0]:
model_conv.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [32]:
model_conv.fit(x_train, y_train,
                   epochs = 7,
                   batch_size = 64,
                   validation_data=(x_test, y_test))

# validation_split = 0.2, it gives val_acc=0.7085 and val_loss=0.5091

Train on 9074 samples, validate on 201 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7fc3199acf28>

In [0]:
model_fp = 'drive/Team Drives/Deep Learning Project/ken_cnn/cnn_64_fixed.model'
# model_conv.save(model_fp)