In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()  # for plot styling
pd.options.display.max_columns = 99

In [2]:
csv_file = 'Data/result.csv'
# csv_file = 'Data/Exam 1_ Part 2 Quiz Student Analysis Report.csv'
sectionID = 1453
max_students_per_group = 20
num_groups = 3
Homogenous = True

In [3]:
def calc_group_sizes(num_students, num_groups):
    '''
    Parameters
    -----------
    num_students : int
        Number of students in the class
    num_groups : int
        Number of groups to break students into
    
    Returns
    ---------
    group_size : List of ideal group sizes
    '''
    group_sizes = []

    class_size = num_students
    group_num_count = num_groups
    group_num = num_groups

    for i in range(group_num_count):
        temp = class_size // group_num
        class_size -= temp
        group_num -= 1
        group_sizes.append(temp)

    return group_sizes

In [9]:
def clean_file(fileName,sectionID):
    '''
    Clean CSV file
    --------------------
    
    Parameters
    -----------
    .csv file : 
    sectionID : Class/Period Number to Group
    
    Returns
    ---------
    Pandas DataFrame (Cleaned)
    '''
    df = pd.read_csv(fileName, encoding='latin-1')
    df.set_index(keys=df['name'],inplace=True)
    df = df.select_dtypes(exclude=['object','bool'])
    
    df.drop(columns=['id','section_sis_id','attempt'],inplace=True)
    
    class_df = df[df['section_id']==sectionID]

    return class_df

In [5]:
def normalize_df(df):
    '''
    Normalize DataFrame Values from 0-1
    
    Parameters
    ----------
    df : DataFrame to Normalize
    
    Returns
    -------
    Normalized DataFrame
    '''
    return df.apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)) if np.min(x) != np.max(x) else x)
    

In [6]:
def add_clusters(df, num_groups=6):
    '''
    Add Clusters
    '''
    kmeans = KMeans(num_groups)
    kmeans.fit(df)
    cluster = kmeans.predict(df)
    df['Cluster'] = cluster
    return df

In [8]:
# def make_barplot(df, x_val = "Cluster",y_val="score"):
#     plot = sns.barplot(x = x_val, y = y_val, data = df);
#     plot.axes.set_title("Mean Score of Students per Cluster",fontsize=20,weight="bold")
#     plot.axes.set_xlabel("Cluster",fontsize=20,weight="bold")
#     plot.axes.set_ylabel("Mean Score",fontsize=20,weight="bold")
#     plot.tick_params(labelsize=15)
#     # plot.figsize=(12,12)

In [9]:
# def make_countplot(df, x_val = 'Cluster'):
#     plot = sns.countplot(x = x_val, data = df);
#     plot.axes.set_title("Frequency of Students per Cluster",fontsize=20,weight="bold")
#     plot.axes.set_xlabel("Cluster",fontsize=20,weight="bold")
#     plot.axes.set_ylabel("Frequency",fontsize=20,weight="bold")
#     plot.tick_params(labelsize=15)

In [10]:
student_df = clean_file(csv_file,sectionID)

In [11]:
student_df = normalize_df(student_df)

In [13]:
student_df = add_clusters(student_df,3)

In [17]:
def return_cluster_list(df,num_groups=6):
    cluster_list = []
    
    for i in range(num_groups):
        cluster_list.append(list(student_df[student_df['Cluster']==i].index))
        
    return cluster_list
        

In [19]:
return_cluster_list(student_df,3)

[['Thing', 'Doctor Doom', 'Mister Fantastic'],
 ['Iron Man',
  'Deadpool',
  'Rocket Raccoon',
  'Wolverine',
  'Steel',
  'Batman',
  'Groot',
  'Superman',
  'Susan Storm',
  'Nightcrawler',
  'Green Arrow',
  'Cyclops',
  'Catwoman',
  'Wonder Woman',
  'Star-Lord'],
 ['Swamp Thing', 'Mystique', 'Superwoman', 'Elektra']]

In [18]:
import multiprocessing as mp
print("Number of processors: ", mp.cpu_count())

Number of processors:  4


In [556]:
index_list = list(student_df.index)



for i in range(10):
    randomized_index_list = np.random.choice(index_list, size = len(index_list),replace=False)
    group_set = set({})
    index_track = 0
    for num in size_list:
        
        j = frozenset(randomized_index_list[0 + index_track:index_track+num])
        group_set.add(j)
        index_track += num
    
    print('\n   Group Trial:', i)
    for group in group_set:
        unfrozen = set(group)
        print(unfrozen)
#         print(unfrozen)
    
    


    


   Group Trial: 0
{'Jacqueline Barajas', 'Briana Franco', 'Jessie Lorenzo'}
{'Janet Sanchez', 'Esther Lopez', 'Clarissa Dimas'}
{'Alejandra Benavides Fuentes', 'Monserrat Garcia', 'Abril Rodriguez', 'Sara Madriz'}
{'Kelly Argueta', 'Noemi Franco', 'Stephanie Abarca-Lopez', 'Prab Kaur Chhina'}
{'Ruben Jr. Villalobos-Ulloa', 'Johnathan Conrique', 'Ricardo Perez', 'Iran Valladares-Fuentes'}
{'Ricardo Marquez', 'Edgar Cabrera', 'Eric Medrano', 'Alondra Romero'}

   Group Trial: 1
{'Janet Sanchez', 'Kelly Argueta', 'Monserrat Garcia', 'Sara Madriz'}
{'Abril Rodriguez', 'Ruben Jr. Villalobos-Ulloa', 'Iran Valladares-Fuentes'}
{'Ricardo Marquez', 'Ricardo Perez', 'Eric Medrano', 'Jessie Lorenzo'}
{'Alejandra Benavides Fuentes', 'Noemi Franco', 'Edgar Cabrera', 'Stephanie Abarca-Lopez'}
{'Johnathan Conrique', 'Esther Lopez', 'Prab Kaur Chhina'}
{'Clarissa Dimas', 'Jacqueline Barajas', 'Briana Franco', 'Alondra Romero'}

   Group Trial: 2
{'Alejandra Benavides Fuentes', 'Kelly Argueta', 'Jessi

In [535]:
j = set(randomized_index_list[0:3])

In [536]:
j

{'Briana Franco', 'Kelly Argueta', 'Ruben Jr. Villalobos-Ulloa'}

In [528]:
index_list

['Ruben Jr. Villalobos-Ulloa',
 'Iran Valladares-Fuentes',
 'Janet Sanchez',
 'Abril Rodriguez',
 'Ricardo Perez',
 'Esther Lopez',
 'Clarissa Dimas',
 'Johnathan Conrique',
 'Kelly Argueta',
 'Jacqueline Barajas',
 'Eric Medrano',
 'Stephanie Abarca-Lopez',
 'Alondra Romero',
 'Ricardo Marquez',
 'Alejandra Benavides Fuentes',
 'Edgar Cabrera',
 'Monserrat Garcia',
 'Noemi Franco',
 'Sara Madriz',
 'Jessie Lorenzo',
 'Prab Kaur Chhina',
 'Briana Franco']

In [534]:
randomized_index_list

array(['Briana Franco', 'Kelly Argueta', 'Ruben Jr. Villalobos-Ulloa',
       'Stephanie Abarca-Lopez', 'Janet Sanchez', 'Clarissa Dimas',
       'Edgar Cabrera', 'Alondra Romero', 'Johnathan Conrique',
       'Prab Kaur Chhina', 'Monserrat Garcia', 'Noemi Franco',
       'Ricardo Perez', 'Ricardo Marquez', 'Iran Valladares-Fuentes',
       'Jessie Lorenzo', 'Jacqueline Barajas', 'Abril Rodriguez',
       'Eric Medrano', 'Alejandra Benavides Fuentes', 'Sara Madriz',
       'Esther Lopez'], dtype='<U27')

In [523]:
# from itertools import combinations

# size_of_groups = 3

# for index in list(combinations(df.index,size_of_groups)):

#     print(df.loc[index,:]['score'])
#     loss = 0
    
#     avg_score = np.mean(df.loc[index,:]['score'])
#     for i in range(size_of_groups):
#         loss += (df.loc[index,:]['score'][i] - avg_score) ** 2
#     print("Loss:",loss)
    
#     print('\n')
    
    

In [483]:
team_list = []

df = student_df.copy()
for i in range(8):
    x = z.sample(n = 3) 

    team_list.append(x.index)


In [482]:
team_list

[Index(['Janet Sanchez', 'Iran Valladares-Fuentes', 'Monserrat Garcia'], dtype='object', name='name'),
 Index(['Janet Sanchez', 'Johnathan Conrique', 'Monserrat Garcia'], dtype='object', name='name'),
 Index(['Abril Rodriguez', 'Stephanie Abarca-Lopez', 'Clarissa Dimas'], dtype='object', name='name'),
 Index(['Stephanie Abarca-Lopez', 'Janet Sanchez', 'Kelly Argueta'], dtype='object', name='name'),
 Index(['Monserrat Garcia', 'Iran Valladares-Fuentes', 'Briana Franco'], dtype='object', name='name'),
 Index(['Iran Valladares-Fuentes', 'Edgar Cabrera', 'Briana Franco'], dtype='object', name='name'),
 Index(['Janet Sanchez', 'Clarissa Dimas', 'Briana Franco'], dtype='object', name='name'),
 Index(['Noemi Franco', 'Monserrat Garcia', 'Iran Valladares-Fuentes'], dtype='object', name='name')]

In [402]:
# student_df_clustered = add_clusters(student_df,num_clusters)
# student_df_clustered.head(5)

In [403]:
# SSE = []
# for cluster in range(1,20):
#     kmeans = KMeans(n_jobs = -1, n_clusters = cluster, init='k-means++')
#     kmeans.fit(student_df)
#     SSE.append(kmeans.inertia_)

# # converting the results into a dataframe and plotting them
# frame = pd.DataFrame({'Cluster':range(1,20), 'SSE':SSE})
# plt.figure(figsize=(12,6))
# plt.plot(frame['Cluster'], frame['SSE'], marker='o')
# plt.xlabel('Number of clusters')
# plt.ylabel('Inertia')