In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.optimize as scp
import os
import glob

In [49]:
df = pd.read_csv('virtualInternshipData_ADS2001_utf8.csv')
df = df.drop('Unnamed: 0', axis = 1)
df

Unnamed: 0,userIDs,implementation,Line_ID,ChatGroup,content,group_id,RoleName,roomName,m_experimental_testing,m_making_design_choices,m_asking_questions,j_customer_consultants_requests,j_performance_parameters_requirements,j_communication,OutcomeScore,wordCount
0,1,a,1,PRNLT,Hello team. Welcome to Nephrotex!,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,5
1,1,a,2,PRNLT,I'm Maria Williams. I'll be your design adviso...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,11
2,1,a,3,PRNLT,I'm here to help if you have any questions.,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,9
3,1,a,4,PRNLT,Please introduce yourselves with the name you ...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,1,0,0,4,51
4,1,a,5,PRNLT,I just want to make sure everyone has found th...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19175,392,o,19179,PESPVP,yes,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,5,1
19176,388,o,19180,PESPVP,sounds good,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,8,2
19177,367,o,19181,PESPVP,"Well, we are out of time for our meeting.",6,Mentor,Reflection team discussion of first batch results,0,0,0,0,0,0,4,9
19178,393,o,19182,PESPVP,Precisely,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,4,1


In [50]:
df['userIDs'] = df['userIDs'].astype(str)
df['Line_ID'] = df['Line_ID'].astype(str)
df['unique_id'] = df[('userIDs')] + df['implementation'] + df['ChatGroup']
df = df[['unique_id', 'userIDs', 'implementation', 'Line_ID', 'ChatGroup', 'content', 'group_id', 'RoleName', 'roomName', 'm_experimental_testing', 'm_making_design_choices', 'm_asking_questions', 'j_customer_consultants_requests', 'j_performance_parameters_requirements', 'j_communication', 'OutcomeScore', 'wordCount']]

In [51]:
df

Unnamed: 0,unique_id,userIDs,implementation,Line_ID,ChatGroup,content,group_id,RoleName,roomName,m_experimental_testing,m_making_design_choices,m_asking_questions,j_customer_consultants_requests,j_performance_parameters_requirements,j_communication,OutcomeScore,wordCount
0,1aPRNLT,1,a,1,PRNLT,Hello team. Welcome to Nephrotex!,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,5
1,1aPRNLT,1,a,2,PRNLT,I'm Maria Williams. I'll be your design adviso...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,11
2,1aPRNLT,1,a,3,PRNLT,I'm here to help if you have any questions.,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,9
3,1aPRNLT,1,a,4,PRNLT,Please introduce yourselves with the name you ...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,1,0,0,4,51
4,1aPRNLT,1,a,5,PRNLT,I just want to make sure everyone has found th...,2,Mentor,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19175,392oPESPVP,392,o,19179,PESPVP,yes,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,5,1
19176,388oPESPVP,388,o,19180,PESPVP,sounds good,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,8,2
19177,367oPESPVP,367,o,19181,PESPVP,"Well, we are out of time for our meeting.",6,Mentor,Reflection team discussion of first batch results,0,0,0,0,0,0,4,9
19178,393oPESPVP,393,o,19182,PESPVP,Precisely,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,4,1


### Checking for missing values

In [52]:
df.isna().sum() # check for missing values

unique_id                                0
userIDs                                  0
implementation                           0
Line_ID                                  0
ChatGroup                                0
content                                  0
group_id                                 0
RoleName                                 3
roomName                                 0
m_experimental_testing                   0
m_making_design_choices                  0
m_asking_questions                       0
j_customer_consultants_requests          0
j_performance_parameters_requirements    0
j_communication                          0
OutcomeScore                             0
wordCount                                0
dtype: int64

In [53]:
df = df.dropna() # dropping rows with missing values as it only makes up for 3/19180 rows

In [54]:
df.isna().sum()

unique_id                                0
userIDs                                  0
implementation                           0
Line_ID                                  0
ChatGroup                                0
content                                  0
group_id                                 0
RoleName                                 0
roomName                                 0
m_experimental_testing                   0
m_making_design_choices                  0
m_asking_questions                       0
j_customer_consultants_requests          0
j_performance_parameters_requirements    0
j_communication                          0
OutcomeScore                             0
wordCount                                0
dtype: int64

# Data analysis

In [55]:
# data frame for each chat topic
# includes messages which have a combination of topics
# explore relationship between number of messages per topic to score
experimental_testing = df[df['m_experimental_testing'] == 1]
design_choices = df[df['m_making_design_choices'] == 1]  
asking_questions = df[df['m_asking_questions'] == 1]
customer_consultant_requests = df[df['j_customer_consultants_requests'] == 1]
performance_parameters_requirements = df[df['j_performance_parameters_requirements'] == 1]

### Data shape

In [56]:
df.shape

(19177, 17)

### Number of groups

In [57]:
df['group_id'].unique()

array([2, 3, 4, 5, 6], dtype=int64)

### Number of participants

In [58]:
len(df['userIDs'].unique())

392

### Number of mentors

In [59]:
mentors_only = df[df['RoleName'] == 'Mentor']
print(f"There are {len(mentors_only['userIDs'].unique())} mentors")
mentors_only['userIDs'].unique()

There are 23 mentors


array(['1', '22', '37', '52', '63', '90', '107', '117', '134', '144',
       '150', '173', '201', '227', '254', '278', '301', '328', '333',
       '338', '343', '349', '367'], dtype=object)

### Creating student only dataframe

In [60]:
students_only = df[df['RoleName'] == 'Player']

In [61]:
students_only

Unnamed: 0,unique_id,userIDs,implementation,Line_ID,ChatGroup,content,group_id,RoleName,roomName,m_experimental_testing,m_making_design_choices,m_asking_questions,j_customer_consultants_requests,j_performance_parameters_requirements,j_communication,OutcomeScore,wordCount
5,2aPRNLT,2,a,6,PRNLT,Hello I am Brandon!,2,Player,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,4
6,3aPRNLT,3,a,7,PRNLT,I am Zelin,2,Player,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,3
7,3aPRNLT,3,a,8,PRNLT,Hi,2,Player,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,1
8,4aPRNLT,4,a,9,PRNLT,i am jack,2,Player,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,4,3
9,5aPRNLT,5,a,10,PRNLT,Hey! I'm Rachel!,2,Player,Introduction and Workflow Tutorial with Entran...,0,0,0,0,0,0,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19173,391oPESPVP,391,o,19177,PESPVP,exactly!,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,5,1
19174,389oPESPVP,389,o,19178,PESPVP,sounds good,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,7,2
19175,392oPESPVP,392,o,19179,PESPVP,yes,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,5,1
19176,388oPESPVP,388,o,19180,PESPVP,sounds good,6,Player,Reflection team discussion of first batch results,0,0,0,0,0,0,8,2


### Create data frame for each group

In [62]:
group_2 = students_only[students_only['group_id'] == 2]
group_3 = students_only[students_only['group_id'] == 3]
group_4 = students_only[students_only['group_id'] == 4]
group_5 = students_only[students_only['group_id'] == 5]
group_6 = students_only[students_only['group_id'] == 6]

In [63]:
print(f"There are {len(group_2['userIDs'].unique())} students in group 2")
print(f"There are {len(group_3['userIDs'].unique())} students in group 3")
print(f"There are {len(group_4['userIDs'].unique())} students in group 4")
print(f"There are {len(group_5['userIDs'].unique())} students in group 5")
print(f"There are {len(group_6['userIDs'].unique())} students in group 6")


There are 78 students in group 2
There are 73 students in group 3
There are 75 students in group 4
There are 72 students in group 5
There are 71 students in group 6


### Average score of each group

In [64]:
average_scores = pd.DataFrame()
average_scores['group'] = students_only['group_id'].unique()

outcome_scores = students_only.groupby('group_id')['OutcomeScore'].mean().reset_index()

average_scores = average_scores.merge(outcome_scores, how='left', left_on='group', right_on='group_id')

In [65]:
average_scores = average_scores.drop(columns = ['group_id'])
average_scores

Unnamed: 0,group,OutcomeScore
0,2,3.410944
1,3,3.672073
2,4,3.781714
3,5,4.087102
4,6,3.636873


### Table of scores of students

In [66]:
student_scores = pd.DataFrame()
student_scores['student'] = students_only['userIDs'].unique()

outcome_scores = students_only.groupby('userIDs')['OutcomeScore'].mean().reset_index()

student_scores = student_scores.merge(outcome_scores, how = 'left', left_on = 'student', right_on = 'userIDs')
student_scores

Unnamed: 0,student,userIDs,OutcomeScore
0,2,2,4.0
1,3,3,4.0
2,4,4,4.0
3,5,5,2.0
4,6,6,2.0
...,...,...,...
364,389,389,7.0
365,390,390,4.0
366,391,391,5.0
367,392,392,5.0


# Current Data frames we have
1. students_only - data frame with only student data
2. mentors_only - data frame with only mentor data
3. experimental_testing 
4. design_choices 
5. asking_questions
6. customer_consultant_requests 
7. performance_parameters_requirements 

# Creating GROUP LEVEL DATA (USE THIS FOR ANALYSIS)

In [67]:
group_level_data = students_only.groupby('group_id').agg(
mean_outcome_score = ('OutcomeScore', 'mean'),
median_outcome_score = ("OutcomeScore", 'median'),
grp_total_experimental_testing = ("m_experimental_testing", 'sum'),
grp_total_making_design_choices = ("m_making_design_choices", 'sum'),
grp_total_asking_questions = ("m_asking_questions", 'sum'),
grp_total_customer_consultants_requests = ("j_customer_consultants_requests", 'sum'),
grp_total_performance_parameters_requirements = ("j_performance_parameters_requirements", 'sum'))
group_level_data['total_messages'] = group_level_data['grp_total_experimental_testing'] + group_level_data['grp_total_making_design_choices'] +group_level_data['grp_total_asking_questions'] +group_level_data['grp_total_customer_consultants_requests'] + group_level_data['grp_total_performance_parameters_requirements']

group_score_range = students_only.groupby('group_id')['OutcomeScore'].apply(lambda x: x.max() - x.min())
group_level_data['score_range'] = group_level_data.index.map(group_score_range)

In [68]:
mentor_interactions = mentors_only.groupby('group_id').agg(average_experimental_testing = ("m_experimental_testing", 'sum'),
    average_making_design_choices = ("m_making_design_choices", 'sum'),
    average_asking_questions = ("m_asking_questions", 'sum'),
    average_customer_consultants_requests = ("j_customer_consultants_requests", 'sum'),
    average_performance_parameters_requirements = ("j_performance_parameters_requirements", 'sum'))
mentor_interactions['total_mentor_interactions'] = mentor_interactions['average_experimental_testing'] + mentor_interactions['average_making_design_choices'] + mentor_interactions['average_asking_questions'] + mentor_interactions['average_customer_consultants_requests'] + mentor_interactions['average_performance_parameters_requirements']

In [69]:
group_level_data['total_mentor_interactions'] = mentor_interactions['total_mentor_interactions']
group_level_data['grp_score_range'] = group_level_data['mean_outcome_score']

group_level_data

Unnamed: 0_level_0,mean_outcome_score,median_outcome_score,grp_total_experimental_testing,grp_total_making_design_choices,grp_total_asking_questions,grp_total_customer_consultants_requests,grp_total_performance_parameters_requirements,total_messages,score_range,total_mentor_interactions,grp_score_range
group_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,3.410944,3.0,102,349,617,63,202,1333,7,211,3.410944
3,3.672073,4.0,61,333,601,86,173,1254,8,209,3.672073
4,3.781714,4.0,154,357,644,56,187,1398,7,234,3.781714
5,4.087102,4.0,111,356,519,54,160,1200,7,231,4.087102
6,3.636873,4.0,91,314,535,44,179,1163,8,228,3.636873


# Creating STUDENT LEVEL DATA (USE THIS FOR ANALYSIS)

In [70]:
groups = students_only.groupby('userIDs')['group_id'].mean().reset_index()
groups

Unnamed: 0,userIDs,group_id
0,10,3.0
1,100,3.0
2,101,4.0
3,102,4.0
4,103,4.0
...,...,...
364,95,2.0
365,96,3.0
366,97,3.0
367,98,3.0


In [71]:
student_level_data = students_only.groupby('userIDs').agg(
outcome_score = ('OutcomeScore', 'mean'),
total_experimental_testing = ("m_experimental_testing", 'sum'),
total_making_design_choices = ("m_making_design_choices", 'sum'),
total_asking_questions = ("m_asking_questions", 'sum'),
total_customer_consultants_requests = ("j_customer_consultants_requests", 'sum'),
total_performance_parameters_requirements = ("j_performance_parameters_requirements", 'sum'))

student_level_data = student_level_data.merge(groups, how = 'left', left_on = 'userIDs', right_on = 'userIDs')

student_level_data.set_index('userIDs', inplace = True)
student_level_data.index = student_level_data.index.astype(int)
student_level_data = student_level_data.sort_index()
student_level_data['total_messages'] = student_level_data['total_experimental_testing'] + student_level_data['total_making_design_choices'] +student_level_data['total_asking_questions'] +student_level_data['total_customer_consultants_requests'] + student_level_data['total_performance_parameters_requirements']

student_level_data['group_id'] = student_level_data['group_id'].round(0)
student_level_data['group_id'] = student_level_data['group_id'].astype(int)

merged_data = pd.merge(student_level_data, group_level_data, on='group_id', how='left')
merged_data = merged_data.drop(columns = merged_data[['mean_outcome_score', 'median_outcome_score', 'score_range']])
merged_data.rename(columns={'total_messages_y': 'total_grp_messages', 'total_messages_x': 'total_indiv_messages'}, inplace=True)
merged_data

Unnamed: 0,outcome_score,total_experimental_testing,total_making_design_choices,total_asking_questions,total_customer_consultants_requests,total_performance_parameters_requirements,group_id,total_indiv_messages,grp_total_experimental_testing,grp_total_making_design_choices,grp_total_asking_questions,grp_total_customer_consultants_requests,grp_total_performance_parameters_requirements,total_grp_messages,total_mentor_interactions,grp_score_range
0,4.0,2,4,17,0,4,2,27,102,349,617,63,202,1333,211,3.410944
1,4.0,1,4,3,0,1,2,9,102,349,617,63,202,1333,211,3.410944
2,4.0,2,2,3,1,5,2,13,102,349,617,63,202,1333,211,3.410944
3,2.0,0,0,6,0,2,2,8,102,349,617,63,202,1333,211,3.410944
4,2.0,0,2,7,1,3,2,13,102,349,617,63,202,1333,211,3.410944
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364,7.0,2,8,4,1,7,6,22,91,314,535,44,179,1163,228,3.636873
365,4.0,0,5,6,1,2,6,14,91,314,535,44,179,1163,228,3.636873
366,5.0,2,3,11,1,4,6,21,91,314,535,44,179,1163,228,3.636873
367,5.0,2,4,4,0,2,6,12,91,314,535,44,179,1163,228,3.636873


# SVM

Group Level

In [None]:
# Group 2
g2 = group_2.drop(columns=['unique_id', 'userIDs','implementation','Line_ID', 
                      'ChatGroup', 'content', 'group_id', 'RoleName', 'roomName'])

In [86]:
X = g2.drop(columns=['OutcomeScore',])  # Features
Y = g2['OutcomeScore']  # Target variable

In [87]:
from sklearn.model_selection import train_test_split # import the splitting method from sklearn

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, 
    test_size=0.5, 
    random_state=0
) 

In [None]:
from sklearn.svm import SVC # import the SVM classifier from sklearn

model = SVC(kernel='linear', C=1.E10) # instantatiate the model with a linear kernel and hard boundaries
model.fit(X_train, Y_train); # fit our data to the model