In [94]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Part 1

In [112]:
train_data = pd.read_csv('poker-hand-training-true.data', header = None)
train_data.columns = ['S1','C1','S2','C2','S3','C3','S4','C4','S5','C5','Label']
train_data.head()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Label
0,1,10,1,11,1,13,1,12,1,1,9.0
1,2,11,2,13,2,10,2,12,2,1,9.0
2,3,12,3,11,3,13,3,10,3,1,9.0
3,4,10,4,11,4,1,4,13,4,12,9.0
4,4,1,4,13,4,12,4,11,4,10,8.0


In [96]:
train_data.describe()

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Label
count,25010.0,25010.0,25010.0,25010.0,25010.0,25010.0,25010.0,25010.0,25010.0,25010.0,25008.0
mean,2.508756,6.995082,2.498041,7.013874,2.510236,7.014194,2.496082,6.942263,2.497521,6.962415,0.621241
std,1.116483,3.74989,1.12305,3.766975,1.123148,3.744937,1.116511,3.747224,1.11909,3.741875,0.789735
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,2.0,4.0,1.0,4.0,2.0,4.0,1.0,4.0,1.0,4.0,0.0
50%,3.0,7.0,2.0,7.0,3.0,7.0,2.0,7.0,3.0,7.0,1.0
75%,4.0,10.0,4.0,10.0,4.0,10.0,3.0,10.0,3.0,10.0,1.0
max,4.0,13.0,11.0,13.0,4.0,13.0,8.0,13.0,7.0,13.0,9.0


In [97]:
#pair-wise correlations of the features in matrix form, excluding the last column (target)
matrix = train_data.iloc[:,:-1].corr()
matrix

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5
S1,1.0,-0.010261,-0.021624,0.008352,-0.019618,-0.009961,-0.017486,0.003896,-0.024305,0.006683
C1,-0.010261,1.0,0.004133,-0.010947,-0.000401,-0.02625,0.003849,-0.014393,0.004737,-0.016932
S2,-0.021624,0.004133,1.0,-0.002777,-0.029149,-0.005522,-0.019458,0.011689,-0.010662,0.00532
C2,0.008352,-0.010947,-0.002777,1.0,-0.005388,-0.024623,-0.008653,-0.0129,0.00044,-0.016308
S3,-0.019618,-0.000401,-0.029149,-0.005388,1.0,0.017947,-0.01293,0.001851,-0.030218,-0.002059
C3,-0.009961,-0.02625,-0.005522,-0.024623,0.017947,1.0,-0.003291,-0.016277,0.003629,-0.010896
S4,-0.017486,0.003849,-0.019458,-0.008653,-0.01293,-0.003291,1.0,-0.008914,-0.017465,0.005056
C4,0.003896,-0.014393,0.011689,-0.0129,0.001851,-0.016277,-0.008914,1.0,0.002598,-0.01459
S5,-0.024305,0.004737,-0.010662,0.00044,-0.030218,0.003629,-0.017465,0.002598,1.0,-0.00346
C5,0.006683,-0.016932,0.00532,-0.016308,-0.002059,-0.010896,0.005056,-0.01459,-0.00346,1.0


# Part 2

In [98]:
def clean_dataset(df):
    """
    This function drops missing data and convert the class label column to data to integers
    """
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(int)

In [113]:
train_data = clean_dataset(train_data)

In [100]:
dupes_dict = {}
count = 0
for ind in train_data.index:
    #dealing with out of bound error
    if ind == len(train_data):
        break
    tuple1= (train_data.iloc[ind,0], train_data.iloc[ind,1])
    tuple2 = (train_data.iloc[ind,2], train_data.iloc[ind,3])
    tuple3 = (train_data.iloc[ind,4], train_data.iloc[ind,5])
    tuple4 = (train_data.iloc[ind,6], train_data.iloc[ind,7])
    tuple5 = (train_data.iloc[ind,8], train_data.iloc[ind,9])
    # order of hands doesn't matter so I used set
    set1 = set([tuple1,tuple2,tuple3,tuple4,tuple5,train_data.iloc[ind,10]])
    if set1 not in dupes_dict.values():
        dupes_dict[ind] = set1
    else:
        count += 1
        print(set1)
        
print("total number of duplicate groups is", count)

{1, (3, 1), (2, 13), (3, 13), (2, 2), (3, 11)}
{0, (3, 8), (1, 5), (3, 7), (2, 6), (4, 1)}
{1, (1, 2), (1, 4), (3, 9), (2, 11), (1, 9)}
{(4, 4), 1, (2, 3), (1, 6), (3, 2), (1, 3)}
{(4, 4), 1, (1, 2), (2, 1), (2, 10), (2, 2)}
{1, (4, 13), (2, 7), (3, 7), (1, 10), (4, 1)}
{0, (3, 4), (1, 7), (1, 13), (3, 9), (3, 12)}
{0, (2, 7), (4, 6), (3, 10), (3, 12), (4, 1)}
{0, (1, 1), (2, 3), (1, 10), (1, 6), (3, 11)}
{1, (2, 4), (1, 8), (2, 3), (4, 5), (2, 8)}
{1, (2, 7), (4, 9), (1, 7), (4, 8), (3, 2)}
{1, (3, 8), (4, 6), (1, 8), (2, 9), (4, 7)}
{0, (3, 7), (2, 3), (1, 13), (3, 6), (3, 5)}
{0, (3, 4), (1, 7), (1, 13), (3, 9), (3, 12)}
{0, (1, 2), (4, 13), (3, 4), (1, 1), (2, 3)}
{0, (2, 7), (4, 12), (3, 10), (4, 2), (2, 3)}
{(4, 4), 1, (2, 7), (1, 11), (4, 1), (4, 7)}
{1, (2, 7), (4, 5), (3, 3), (3, 6), (1, 3)}
{(4, 4), 1, (1, 4), (3, 6), (3, 12), (3, 2)}
{1, (2, 7), (1, 1), (1, 8), (2, 6), (4, 8)}
{2, (2, 7), (3, 7), (2, 6), (2, 12), (3, 6)}
{0, (3, 8), (4, 3), (1, 4), (3, 13), (1, 7)}
{(1, 12),

# Part 3

In [101]:
test_data = pd.read_csv('poker-hand-testing.data', header = None)
test_data.columns = ['S1','C1','S2','C2','S3','C3','S4','C4','S5','C5','Label']
test_data = clean_dataset(test_data)

X_train = train_data.iloc[:,:-1]
Y_train = train_data.iloc[:,-1]
X_test = test_data.iloc[:,:-1]
Y_test = test_data.iloc[:,-1]

In [102]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

alg = DecisionTreeClassifier(random_state=1, criterion='gini')
alg.fit(X_train, Y_train)
y_pred = alg.predict(X_test)
print('Accuracy score:',accuracy_score(Y_test,y_pred))

Accuracy score: 0.478449


In [103]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators = 200)
model.fit(X_train,Y_train)
predictions = model.predict(X_test)
print('Accuracy score:',accuracy_score(Y_test,predictions))

Accuracy score 0.619192


# Part 4

In [107]:
import itertools
def eval_hand(num_list, suit_list):
    values = sorted(num_list, reverse=True)
    straight = (values == list(range(values[0], values[0]-5, -1))
                or values == [13,12,11,10,1])
    flush = all(s == suit_list[0] for s in suit_list)
    
    if values == [13,12,11,10,1] and flush:
        return 9
    if straight and flush: 
        return 8
    if flush: 
        return 5
    if straight: 
        return 4
    
    triples = []
    pairs = []
    for v, group in itertools.groupby(values):
        count = sum(1 for _ in group)
        if count == 4: 
            return 7
        elif count == 3: 
            triples.append(v)
        elif count == 2: 
            pairs.append(v)

    if triples: 
        return (6 if pairs else 3)
    
    return len(pairs)

In [121]:
for ind in train_data.index:
    #dealing with out of bound error
    if ind == len(train_data):
        break
    num_list = [train_data.iloc[ind,1],train_data.iloc[ind,3],train_data.iloc[ind,5],train_data.iloc[ind,7],train_data.iloc[ind,9]]
    suit_list = [train_data.iloc[ind,0],train_data.iloc[ind,2],train_data.iloc[ind,4],train_data.iloc[ind,6],train_data.iloc[ind,8]]
    correct_label = eval_hand(num_list,suit_list)
    if correct_label != train_data.iloc[ind,10]:
        print(ind, train_data.iloc[ind,:].tolist(), 'Correct label should be:',correct_label)
    
    train_data.iat[ind, 10] =  correct_label

4 [4, 1, 4, 13, 4, 12, 4, 11, 4, 10, 8] Correct label should be: 9
5 [1, 2, 1, 4, 1, 5, 1, 3, 1, 6, 9] Correct label should be: 8
8465 [3, 8, 4, 8, 1, 8, 1, 12, 2, 8, 3] Correct label should be: 7
9330 [2, 9, 3, 11, 1, 11, 1, 9, 3, 9, 3] Correct label should be: 6
9581 [3, 9, 3, 10, 3, 11, 3, 1, 3, 5, 8] Correct label should be: 5
9870 [2, 9, 2, 10, 3, 6, 1, 7, 4, 8, 8] Correct label should be: 4
9922 [2, 11, 3, 2, 2, 2, 4, 5, 1, 2, 7] Correct label should be: 3
9925 [2, 3, 1, 12, 3, 12, 3, 11, 1, 3, 1] Correct label should be: 2
9928 [3, 10, 4, 11, 3, 1, 3, 11, 1, 4, 0] Correct label should be: 1
9930 [4, 10, 3, 4, 4, 13, 3, 6, 1, 1, 1] Correct label should be: 0


In [123]:
# To check if we corrected the label
train_data.head(6)

Unnamed: 0,S1,C1,S2,C2,S3,C3,S4,C4,S5,C5,Label
0,1,10,1,11,1,13,1,12,1,1,9
1,2,11,2,13,2,10,2,12,2,1,9
2,3,12,3,11,3,13,3,10,3,1,9
3,4,10,4,11,4,1,4,13,4,12,9
4,4,1,4,13,4,12,4,11,4,10,9
5,1,2,1,4,1,5,1,3,1,6,8


In [124]:
#Rebuild
new_model = RandomForestClassifier(n_estimators = 200)
new_model.fit(X_train,Y_train)

RandomForestClassifier(n_estimators=200)

In [125]:
for ind in test_data.index:
    #dealing with out of bound error
    if ind == len(test_data):
        break
    num_list = [test_data.iloc[ind,1],test_data.iloc[ind,3],test_data.iloc[ind,5],test_data.iloc[ind,7],test_data.iloc[ind,9]]
    suit_list = [test_data.iloc[ind,0],test_data.iloc[ind,2],test_data.iloc[ind,4],test_data.iloc[ind,6],test_data.iloc[ind,8]]
    correct_label = eval_hand(num_list,suit_list)
    if correct_label != test_data.iloc[ind,10]:
        print(ind, test_data.iloc[ind,:].tolist(), 'Correct label should be:',correct_label)
    
    test_data.iat[ind, 10] =  correct_label

944522 [4, 1, 4, 11, 4, 10, 4, 12, 4, 13, 5] Correct label should be: 9
991931 [4, 10, 4, 12, 4, 9, 4, 11, 4, 13, 9] Correct label should be: 8
993160 [1, 10, 3, 1, 3, 13, 2, 3, 3, 11, 1] Correct label should be: 0
993161 [2, 2, 2, 8, 2, 7, 4, 7, 3, 4, 2] Correct label should be: 1
993164 [3, 9, 2, 12, 2, 11, 4, 11, 1, 12, 6] Correct label should be: 2
993218 [2, 6, 2, 12, 3, 6, 1, 9, 1, 6, 6] Correct label should be: 3
993281 [4, 13, 3, 12, 2, 1, 2, 10, 1, 11, 0] Correct label should be: 4
993340 [3, 12, 3, 9, 3, 2, 3, 10, 3, 6, 0] Correct label should be: 5
994668 [3, 13, 4, 13, 1, 13, 2, 7, 1, 7, 7] Correct label should be: 6
994891 [4, 9, 3, 13, 2, 13, 1, 13, 4, 13, 6] Correct label should be: 7


In [126]:
predictions = model.predict(X_test)
print('Accuracy score:',accuracy_score(Y_test,predictions))

Accuracy score: 0.619193


Accuracy score was 0.619192 before, now it is 0.619193. It is barely an improvement.