# Introduction

In [31]:
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import metrics
import random

In [2]:
# Column labels to apply to the chat log; passed to read_csv via names=.
columns = [
    "match_id",
    "player_slot",
    "chat_key",
    "unit",
    "radiant_win",
]
# Read the chat log, decoding it as ISO-8859-1.
chat_data = pd.read_csv(
    "match_chat.csv",
    names=columns,
    encoding="ISO-8859-1",
)
chat_data.head()

Unnamed: 0,match_id,player_slot,chat_key,unit,radiant_win
0,5008188550,128,yes you are girl,5,0
1,5008188550,128,xoxo,5,0
2,5008188550,1,stop,1,0
3,5008188550,1,I cant love the enemy,1,0
4,5008188550,130,I'd paint u,7,0


In [3]:
# Per the API, player_slot values 0-127 belong to Radiant and 128-255 to Dire.
radiant_mask = chat_data['player_slot'].le(127)
chat_data['is_radiant'] = np.where(radiant_mask, 1, 0)
# Add a `win` column: 1 when the player's side flag equals radiant_win, else 0.
win_condition = chat_data['is_radiant'].eq(chat_data['radiant_win'])
chat_data['win'] = np.where(win_condition, 1, 0)
chat_data.head()

Unnamed: 0,match_id,player_slot,chat_key,unit,radiant_win,is_radiant,win
0,5008188550,128,yes you are girl,5,0,0,1
1,5008188550,128,xoxo,5,0,0,1
2,5008188550,1,stop,1,0,1,0
3,5008188550,1,I cant love the enemy,1,0,1,0
4,5008188550,130,I'd paint u,7,0,0,1


In [4]:
# Drop the columns that are no longer needed.
# errors='ignore' keeps this cell idempotent: chat_data is rebound here, so
# re-running the cell would otherwise raise KeyError for the already-dropped
# columns. `columns=` alone is enough; the extra `axis=1` was redundant.
chat_data = chat_data.drop(
    columns=['match_id', 'player_slot', 'unit', 'radiant_win', 'is_radiant'],
    errors='ignore',
)
chat_data.head()

Unnamed: 0,chat_key,win
0,yes you are girl,1
1,xoxo,1
2,stop,0
3,I cant love the enemy,0
4,I'd paint u,1


# Explore dataset

In [5]:
# Report the dimensions of the dataset (row count, column count).
n_rows, n_cols = chat_data.shape
print("Input data has {} rows and {} columns.".format(n_rows, n_cols))

Input data has 4709 rows and 2 columns.


In [6]:
# Row count of the dataset (first entry of the shape tuple).
chat_data.shape[0]

4709

In [7]:
# Count how many rows are labelled as a win (1) versus a loss (0).
win_rows = chat_data[chat_data['win'] == 1]
loss_rows = chat_data[chat_data['win'] == 0]
summary = "Out of {} rows, {} are wins, {} are loses".format(
    len(chat_data), len(win_rows), len(loss_rows)
)
print(summary)

Out of 4709 rows, 2578 are wins, 2131 are loses


In [8]:
# Check both remaining columns for missing values.
win_nulls = chat_data['win'].isna().sum()
chat_nulls = chat_data['chat_key'].isna().sum()
print("Number of null in win: {}".format(win_nulls))
print("Number of null in chat_key: {}".format(chat_nulls))

Number of null in win: 0
Number of null in chat_key: 0


# Pre-processing chat data

Remove punctuation, tokenize the text, and remove stopwords.

In [22]:
import nltk
import string
import re
# English stopword list from NLTK; clean_chat filters tokens against it.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download('stopwords')) — confirm for fresh environments.
stopword = nltk.corpus.stopwords.words('english')

def clean_chat(text, stopwords=None):
    """Strip punctuation from a chat message, tokenize it, and drop stopwords.

    Parameters
    ----------
    text : str
        Raw chat message.
    stopwords : collection of str, optional
        Lower-case words to discard. When omitted, the notebook-level
        ``stopword`` list is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order and casing. Empty
        tokens are never returned, so an empty or whitespace-only message
        yields an empty list.
    """
    if stopwords is None:
        stopwords = stopword
    # Remove every character listed in string.punctuation.
    text_no_punct = "".join(char for char in text if char not in string.punctuation)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    token_text = re.split(r'\W+', text_no_punct)
    # re.split yields '' for empty input and for leading/trailing separators;
    # skip those. Compare lower-cased so capitalised tokens such as "I" or
    # "The" are matched against the lower-case stopword entries.
    return [word for word in token_text if word and word.lower() not in stopwords]

# Split into train and test sets

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing. A fixed random_state makes the shuffle
# deterministic, so the split (and every downstream metric) is reproducible
# after a kernel restart.
x_train, x_test, y_train, y_test = train_test_split(
    chat_data['chat_key'],
    chat_data['win'],
    test_size=0.33,
    random_state=42,
)