# 2. Data preparation for Clustering
Before running the clustering analysis we need to prepare the data. We want to cluster participants on their profit margin and also on the study they took part in. To do this, we build CSV files containing these features, which we can then use for clustering.

In [1]:
import pandas as pd
import seaborn as sn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score

ModuleNotFoundError: No module named 'pandas'

In [3]:
# Raw tables for the 95-, 100- and 150-trial experiments.
# index_*: participant metadata (its 'Study' column is used further down);
# wi_* / lo_*: per-trial wins and losses; choice_*: per-trial choices.
index95, index100, index150 = (
    pd.read_csv('data/index_95.csv'),
    pd.read_csv('data/index_100.csv'),
    pd.read_csv('data/index_150.csv'),
)
win95, win100, win150 = (
    pd.read_csv('data/wi_95.csv'),
    pd.read_csv('data/wi_100.csv'),
    pd.read_csv('data/wi_150.csv'),
)
loss95, loss100, loss150 = (
    pd.read_csv('data/lo_95.csv'),
    pd.read_csv('data/lo_100.csv'),
    pd.read_csv('data/lo_150.csv'),
)
choice95, choice100, choice150 = (
    pd.read_csv('data/choice_95.csv'),
    pd.read_csv('data/choice_100.csv'),
    pd.read_csv('data/choice_150.csv'),
)

### Creating margin csv files

In [4]:
# Relabel the 95 win columns as Trial1..Trial95; the loss table gets the same
# labels so the two can later be added column by column.
columnnames95 = [f'Trial{num}' for num in range(1, 96)]
wins95 = win95.set_axis(columnnames95, axis='columns')
wins95.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial86,Trial87,Trial88,Trial89,Trial90,Trial91,Trial92,Trial93,Trial94,Trial95
Subj_1,100,100,100,100,100,100,100,100,100,100,...,50,50,50,50,50,50,50,50,50,50
Subj_2,100,100,50,100,100,100,100,100,100,100,...,50,100,100,100,100,100,50,50,50,50
Subj_3,50,50,50,100,100,100,100,100,100,100,...,100,100,100,50,50,50,50,50,50,50
Subj_4,50,50,100,100,100,100,100,50,100,100,...,100,50,50,50,50,50,50,50,50,50
Subj_5,100,100,50,50,50,100,100,100,100,100,...,50,50,50,50,50,50,50,50,50,50


In [5]:
# Same Trial1..Trial95 labels as the win table (columnnames95 is built in the cell above).
losses95 = loss95.set_axis(columnnames95, axis='columns')
losses95.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial86,Trial87,Trial88,Trial89,Trial90,Trial91,Trial92,Trial93,Trial94,Trial95
Subj_1,0,0,0,0,0,0,0,0,-1250,0,...,0,0,0,0,0,0,0,-250,0,0
Subj_2,0,0,0,0,0,0,0,0,0,0,...,-50,-300,0,-350,0,0,0,0,0,-25
Subj_3,0,0,0,0,0,0,0,-150,0,0,...,0,0,0,0,0,0,-250,0,0,0
Subj_4,0,0,0,0,-150,0,0,0,0,0,...,0,-50,0,-50,-50,0,-25,0,0,0
Subj_5,0,0,0,0,0,0,-150,0,0,0,...,-75,0,0,0,0,0,0,0,0,0


In [6]:
# Net outcome per trial: element-wise wins + losses, aligned on row and column labels.
# fill_value=0 treats a cell that is missing in only one of the two tables as zero.
df95_sum = wins95.add(losses95, fill_value=0)
df95_sum.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial86,Trial87,Trial88,Trial89,Trial90,Trial91,Trial92,Trial93,Trial94,Trial95
Subj_1,100,100,100,100,100,100,100,100,-1150,100,...,50,50,50,50,50,50,50,-200,50,50
Subj_2,100,100,50,100,100,100,100,100,100,100,...,0,-200,100,-250,100,100,50,50,50,25
Subj_3,50,50,50,100,100,100,100,-50,100,100,...,100,100,100,50,50,50,-200,50,50,50
Subj_4,50,50,100,100,-50,100,100,50,100,100,...,100,0,50,0,0,50,25,50,50,50
Subj_5,100,100,50,50,50,100,-50,100,100,100,...,-25,50,50,50,50,50,50,50,50,50


In [7]:
# Total net outcome per participant (row-wise sum over all trials), stored as
# a one-column frame named 'Margin'.
profit95 = df95_sum.sum(axis='columns')
profit95df = pd.DataFrame(data=profit95).rename(columns={0: 'Margin'})
profit95df.head()

Unnamed: 0,Margin
Subj_1,1150
Subj_2,-675
Subj_3,-750
Subj_4,-525
Subj_5,100


In [8]:
# Preview the per-trial choices recorded for the 95-trial experiment.
choice95.head()

Unnamed: 0,Choice_1,Choice_2,Choice_3,Choice_4,Choice_5,Choice_6,Choice_7,Choice_8,Choice_9,Choice_10,...,Choice_86,Choice_87,Choice_88,Choice_89,Choice_90,Choice_91,Choice_92,Choice_93,Choice_94,Choice_95
Subj_1,2,2,2,2,2,2,2,2,2,1,...,4,4,4,4,4,4,4,4,4,4
Subj_2,1,2,3,2,2,2,2,2,2,2,...,3,1,1,1,2,2,3,4,4,3
Subj_3,3,4,3,2,2,1,1,1,1,2,...,2,2,2,4,4,4,4,4,4,4
Subj_4,4,3,1,1,1,2,2,3,2,2,...,2,3,3,3,3,3,3,4,4,4
Subj_5,1,2,3,4,3,1,1,2,2,2,...,3,3,4,4,3,4,4,4,4,4


In [9]:
# Most frequent choice per participant (row-wise mode).
# mode() can return several columns when values tie; only column 0 is named and
# used downstream.
mode95 = choice95.mode(axis='columns').rename(columns={0: 'Most Common Choice'})
mode95.head()

Unnamed: 0,Most Common Choice
Subj_1,4
Subj_2,4
Subj_3,4
Subj_4,4
Subj_5,4


In [12]:
# Attach the modal choice to the feature table.
# .values drops the index, so rows are matched by position, not by label.
profit95df['Most Common Choice'] = mode95['Most Common Choice'].values

In [13]:
# Attach each participant's study name from the index table (matched by row position).
# NOTE(review): assumes index95 lists participants in the same order as the win/loss tables — confirm.
profit95df['Study'] = index95['Study'].values
profit95df.head()

Unnamed: 0,Margin,Study,Average Choice,Most Common Choice
Subj_1,1150,Fridberg,3.4,4
Subj_2,-675,Fridberg,2.568421,4
Subj_3,-750,Fridberg,2.778947,4
Subj_4,-525,Fridberg,2.810526,4
Subj_5,100,Fridberg,3.021053,4


In [14]:
# Mean of the recorded choice values per participant, added as 'Average Choice'.
mean95 = choice95.mean(axis='columns')
mean95df = pd.DataFrame(data=mean95).rename(columns={0: 'Average Choice'})
profit95df['Average Choice'] = mean95df['Average Choice'].values
profit95df.head()

Unnamed: 0,Margin,Study,Average Choice,Most Common Choice
Subj_1,1150,Fridberg,3.4,4
Subj_2,-675,Fridberg,2.568421,4
Subj_3,-750,Fridberg,2.778947,4
Subj_4,-525,Fridberg,2.810526,4
Subj_5,100,Fridberg,3.021053,4


In [74]:
# Save the 95-trial feature table (row labels are written as the first CSV column).
# The target directory is spelled 'data/', matching the directory the raw CSVs
# are read from, so the path also resolves on case-sensitive filesystems.
profit95df.to_csv('data/cleaned95.csv')

## We now repeat these steps for the 100-trial and 150-trial experiments

In [15]:
# Relabel the 100 win columns as Trial1..Trial100; the loss table gets the same
# labels so the two can later be added column by column.
columnnames100 = [f'Trial{num}' for num in range(1, 101)]
wins100 = win100.set_axis(columnnames100, axis='columns')
wins100.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial91,Trial92,Trial93,Trial94,Trial95,Trial96,Trial97,Trial98,Trial99,Trial100
Subj_1,100,100,100,50,50,100,100,100,50,100,...,100,100,100,100,100,100,50,100,50,100
Subj_2,100,100,50,50,50,100,50,100,100,100,...,50,100,50,50,100,50,50,100,100,50
Subj_3,50,100,50,100,50,100,50,50,50,50,...,50,100,100,50,100,100,100,50,100,100
Subj_4,50,50,50,100,100,50,50,100,100,100,...,50,100,50,50,50,50,100,50,50,50
Subj_5,100,100,100,100,100,50,50,100,50,100,...,100,100,100,100,50,50,50,50,50,50


In [16]:
# Same Trial1..Trial100 labels as the win table (columnnames100 is built in the cell above).
losses100 = loss100.set_axis(columnnames100, axis='columns')
losses100.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial91,Trial92,Trial93,Trial94,Trial95,Trial96,Trial97,Trial98,Trial99,Trial100
Subj_1,-200,-150,0,-250,0,0,0,0,0,0,...,0,0,-350,0,0,0,0,0,0,-1250
Subj_2,0,0,0,0,0,0,-50,0,0,0,...,0,0,0,-250,-1250,0,0,0,0,0
Subj_3,0,0,-50,-300,0,-1250,0,0,0,-50,...,-50,0,-200,0,0,0,0,0,0,0
Subj_4,-250,-50,0,0,-200,0,0,0,0,-1250,...,0,0,0,0,-50,0,0,0,-50,0
Subj_5,0,0,0,-1250,0,-50,0,0,0,0,...,0,0,0,-1250,0,0,0,-50,0,0


In [17]:
# Net outcome per trial: element-wise wins + losses, aligned on row and column labels.
# fill_value=0 treats a cell that is missing in only one of the two tables as zero.
df100_sum = wins100.add(losses100, fill_value=0)
df100_sum.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial91,Trial92,Trial93,Trial94,Trial95,Trial96,Trial97,Trial98,Trial99,Trial100
Subj_1,-100,-50,100,-200,50,100,100,100,50,100,...,100,100,-250,100,100,100,50,100,50,-1150
Subj_2,100,100,50,50,50,100,0,100,100,100,...,50,100,50,-200,-1150,50,50,100,100,50
Subj_3,50,100,0,-200,50,-1150,50,50,50,0,...,0,100,-100,50,100,100,100,50,100,100
Subj_4,-200,0,50,100,-100,50,50,100,100,-1150,...,50,100,50,50,0,50,100,50,0,50
Subj_5,100,100,100,-1150,100,0,50,100,50,100,...,100,100,100,-1150,50,50,50,0,50,50


In [18]:
# Total net outcome per participant (row-wise sum over all trials), stored as
# a one-column frame named 'Margin'.
profit100 = df100_sum.sum(axis='columns')
profit100df = pd.DataFrame(data=profit100).rename(columns={0: 'Margin'})
profit100df.head()

Unnamed: 0,Margin
Subj_1,-1800
Subj_2,-800
Subj_3,-450
Subj_4,1200
Subj_5,-1300


In [19]:
# Attach each participant's study name from the index table (matched by row position).
# NOTE(review): assumes index100 lists participants in the same order as the win/loss tables — confirm.
profit100df['Study'] = index100['Study'].values
profit100df

Unnamed: 0,Margin,Study
Subj_1,-1800,Horstmann
Subj_2,-800,Horstmann
Subj_3,-450,Horstmann
Subj_4,1200,Horstmann
Subj_5,-1300,Horstmann
...,...,...
Subj_500,75,Worthy
Subj_501,600,Worthy
Subj_502,-1525,Worthy
Subj_503,-750,Worthy


In [20]:
# Most frequent choice per participant (row-wise mode), attached by row position.
# mode() can return several columns when values tie; only column 0 is named and used.
mode100 = choice100.mode(axis='columns').rename(columns={0: 'Most Common Choice'})
profit100df['Most Common Choice'] = mode100['Most Common Choice'].values
profit100df.head()

Unnamed: 0,Margin,Study,Most Common Choice
Subj_1,-1800,Horstmann,2.0
Subj_2,-800,Horstmann,2.0
Subj_3,-450,Horstmann,2.0
Subj_4,1200,Horstmann,4.0
Subj_5,-1300,Horstmann,2.0


In [21]:
# How many 100-trial participants have each value as their modal choice.
profit100df['Most Common Choice'].value_counts()

2.0    221
4.0    171
3.0     98
1.0     14
Name: Most Common Choice, dtype: int64

In [22]:
# The modal choice was stored as floats (2.0, 4.0, ...) for this dataset; cast it
# to int64 so the column holds integer labels.
profit100df['Most Common Choice'] = profit100df['Most Common Choice'].astype('int64')
profit100df.head()

Unnamed: 0,Margin,Study,Most Common Choice
Subj_1,-1800,Horstmann,2
Subj_2,-800,Horstmann,2
Subj_3,-450,Horstmann,2
Subj_4,1200,Horstmann,4
Subj_5,-1300,Horstmann,2


In [23]:
# Mean of the recorded choice values per participant, added as 'Average Choice'.
mean100 = choice100.mean(axis='columns')
mean100df = pd.DataFrame(data=mean100).rename(columns={0: 'Average Choice'})
profit100df['Average Choice'] = mean100df['Average Choice'].values
profit100df.head()

Unnamed: 0,Margin,Study,Most Common Choice,Average Choice
Subj_1,-1800,Horstmann,2,2.38
Subj_2,-800,Horstmann,2,2.7
Subj_3,-450,Horstmann,2,2.46
Subj_4,1200,Horstmann,4,2.85
Subj_5,-1300,Horstmann,2,2.65


In [71]:
# Save the 100-trial feature table (row labels are written as the first CSV column).
# The target directory is spelled 'data/', matching the directory the raw CSVs
# are read from, so the path also resolves on case-sensitive filesystems.
profit100df.to_csv('data/cleaned100.csv')

Lastly, we repeat the same steps for the 150-trial data.

In [24]:
# Relabel the 150 win columns as Trial1..Trial150; the loss table gets the same
# labels so the two can later be added column by column.
columnnames150 = [f'Trial{num}' for num in range(1, 151)]
wins150 = win150.set_axis(columnnames150, axis='columns')
wins150.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial141,Trial142,Trial143,Trial144,Trial145,Trial146,Trial147,Trial148,Trial149,Trial150
Subj_1,50,100,100,100,50,100,50,100,50,50,...,50,100,50,100,50,100,50,100,50,100
Subj_2,100,100,50,50,50,100,50,100,100,100,...,100,100,100,100,100,50,100,50,50,100
Subj_3,100,50,100,50,100,100,50,50,100,50,...,50,50,50,50,50,50,50,50,50,50
Subj_4,50,50,50,50,100,50,50,50,50,100,...,50,50,50,50,50,50,50,50,50,50
Subj_5,50,50,50,50,50,50,50,50,50,50,...,50,50,50,50,50,50,50,50,50,50


In [25]:
# Same Trial1..Trial150 labels as the win table (columnnames150 is built in the cell above).
losses150 = loss150.set_axis(columnnames150, axis='columns')
losses150.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial141,Trial142,Trial143,Trial144,Trial145,Trial146,Trial147,Trial148,Trial149,Trial150
Subj_1,-250,0,-350,0,0,-200,0,0,0,0,...,0,-250,0,-1250,0,0,-50,0,0,-150
Subj_2,-250,-350,0,0,0,0,0,0,0,0,...,0,0,0,-1250,0,-250,-300,0,-50,0
Subj_3,0,0,0,0,-150,-1250,0,-50,-350,0,...,0,0,0,0,0,0,0,0,0,0
Subj_4,0,0,0,0,-150,0,0,-50,-250,0,...,0,0,0,0,-250,0,0,0,0,0
Subj_5,0,0,0,0,0,0,-250,0,0,0,...,0,0,0,0,0,0,0,0,0,-250


In [26]:
# Net outcome per trial: element-wise wins + losses, aligned on row and column labels.
# fill_value=0 treats a cell that is missing in only one of the two tables as zero.
df150_sum = wins150.add(losses150, fill_value=0)
df150_sum.head()

Unnamed: 0,Trial1,Trial2,Trial3,Trial4,Trial5,Trial6,Trial7,Trial8,Trial9,Trial10,...,Trial141,Trial142,Trial143,Trial144,Trial145,Trial146,Trial147,Trial148,Trial149,Trial150
Subj_1,-200,100,-250,100,50,-100,50,100,50,50,...,50,-150,50,-1150,50,100,0,100,50,-50
Subj_2,-150,-250,50,50,50,100,50,100,100,100,...,100,100,100,-1150,100,-200,-200,50,0,100
Subj_3,100,50,100,50,-50,-1150,50,0,-250,50,...,50,50,50,50,50,50,50,50,50,50
Subj_4,50,50,50,50,-50,50,50,0,-200,100,...,50,50,50,50,-200,50,50,50,50,50
Subj_5,50,50,50,50,50,50,-200,50,50,50,...,50,50,50,50,50,50,50,50,50,-200


In [27]:
# Total net outcome per participant (row-wise sum over all trials), stored as
# a one-column frame named 'Margin'.
profit150 = df150_sum.sum(axis='columns')
profit150df = pd.DataFrame(data=profit150).rename(columns={0: 'Margin'})
profit150df.head()

Unnamed: 0,Margin
Subj_1,-550
Subj_2,-1600
Subj_3,900
Subj_4,2200
Subj_5,1900


In [28]:
# Attach each participant's study name from the index table (matched by row position).
# NOTE(review): assumes index150 lists participants in the same order as the win/loss tables — confirm.
profit150df['Study'] = index150['Study'].values
profit150df

Unnamed: 0,Margin,Study
Subj_1,-550,Steingroever2011
Subj_2,-1600,Steingroever2011
Subj_3,900,Steingroever2011
Subj_4,2200,Steingroever2011
Subj_5,1900,Steingroever2011
...,...,...
Subj_94,300,Wetzels
Subj_95,2150,Wetzels
Subj_96,1450,Wetzels
Subj_97,1200,Wetzels


In [29]:
# Most frequent choice per participant (row-wise mode), attached by row position.
# mode() can return several columns when values tie; only column 0 is named and used.
mode150 = choice150.mode(axis='columns').rename(columns={0: 'Most Common Choice'})
profit150df['Most Common Choice'] = mode150['Most Common Choice'].values

In [33]:
# Mean of the recorded choice values per participant, added as 'Average Choice'.
mean150 = choice150.mean(axis='columns')
mean150df = pd.DataFrame(data=mean150).rename(columns={0: 'Average Choice'})
profit150df['Average Choice'] = mean150df['Average Choice'].values
profit150df.head()

Unnamed: 0,Margin,Study,Most Common Choice,Average Choice
Subj_1,-550,Steingroever2011,1,2.393333
Subj_2,-1600,Steingroever2011,2,2.313333
Subj_3,900,Steingroever2011,4,3.1
Subj_4,2200,Steingroever2011,4,3.426667
Subj_5,1900,Steingroever2011,4,3.72


In [69]:
# Save the 150-trial feature table (row labels are written as the first CSV column).
# The target directory is spelled 'data/', matching the directory the raw CSVs
# are read from, so the path also resolves on case-sensitive filesystems.
profit150df.to_csv('data/cleaned150.csv')

In [35]:
# Stack the 95-trial and 150-trial feature tables row-wise.
# Row labels are kept as they are, so a label such as Subj_1 appears once per source table.
merged95_150 = pd.concat([profit95df, profit150df])
merged95_150.head(25)

Unnamed: 0,Margin,Study,Average Choice,Most Common Choice
Subj_1,1150,Fridberg,3.4,4
Subj_2,-675,Fridberg,2.568421,4
Subj_3,-750,Fridberg,2.778947,4
Subj_4,-525,Fridberg,2.810526,4
Subj_5,100,Fridberg,3.021053,4
Subj_6,1250,Fridberg,3.221053,4
Subj_7,-150,Fridberg,2.663158,4
Subj_8,150,Fridberg,2.926316,4
Subj_9,-575,Fridberg,2.842105,4
Subj_10,1475,Fridberg,3.357895,4


In [36]:
# Append the 100-trial participants to get one table covering all three experiments.
# NOTE(review): the Subj_n index labels are not unique across the source tables;
# consider ignore_index=True or keys=... if subject identity matters downstream.
mergedall = pd.concat([merged95_150, profit100df])
mergedall

Unnamed: 0,Margin,Study,Average Choice,Most Common Choice
Subj_1,1150,Fridberg,3.400000,4
Subj_2,-675,Fridberg,2.568421,4
Subj_3,-750,Fridberg,2.778947,4
Subj_4,-525,Fridberg,2.810526,4
Subj_5,100,Fridberg,3.021053,4
...,...,...,...,...
Subj_500,75,Worthy,2.630000,2
Subj_501,600,Worthy,2.840000,3
Subj_502,-1525,Worthy,2.380000,2
Subj_503,-750,Worthy,2.460000,1


In [40]:
# Encode each study name as an integer label and drop the text column.
# The keys are applied as regular expressions (regex=True), so a key matches
# any study name that contains it.
# NOTE(review): the codes are arbitrary identifiers, not an ordering — confirm the
# clustering step treats StudyNumber as categorical.
replacements_study = {
    r'Fridberg': 0,
    r'Horstmann': 1,
    r'Kjome': 2,
    r'Maia': 3,
    r'SteingroverInPrep': 4,
    r'Premkumar': 5,
    r'Wood': 6,
    r'Worthy': 7,
    r'Steingroever2011': 8,
    r'Wetzels': 9,
}

mergedall = (
    mergedall
    .assign(StudyNumber=mergedall['Study'].replace(replacements_study, regex=True))
    .drop(columns='Study')
)
mergedall

Unnamed: 0,Margin,Average Choice,Most Common Choice,StudyNumber
Subj_1,1150,3.400000,4,0
Subj_2,-675,2.568421,4,0
Subj_3,-750,2.778947,4,0
Subj_4,-525,2.810526,4,0
Subj_5,100,3.021053,4,0
...,...,...,...,...
Subj_500,75,2.630000,2,7
Subj_501,600,2.840000,3,7
Subj_502,-1525,2.380000,2,7
Subj_503,-750,2.460000,1,7


In [41]:
# Save the combined feature table (row labels are written as the first CSV column).
# The target directory is spelled 'data/', matching the directory the raw CSVs
# are read from, so the path also resolves on case-sensitive filesystems.
mergedall.to_csv('data/cleaned_all.csv')