In [73]:
import pandas as pd
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
lemmatizer = WordNetLemmatizer() 
ps = PorterStemmer()

university_data = pd.read_excel('Labeled Courses Orig.xlsx', header=0)
university_data

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster,Minor Cluster Name
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",15,Algorithms
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",15,Algorithms
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",15,Algorithms
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",15,Algorithms
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",17,Algorithms
5,CS 430,Introduction to Algorithms,"Introduction to the design, behavior, and anal...",Illinois Institute of Tech,"Chicago, Illinois",17,Algorithms
6,CS 5800,Algorithms,Presents the mathematical techniques used for ...,Northeastern,"Boston, Massachusetts",17,Algorithms
7,CME 309,Randomized Algorithms and Probabilistic Analysis,Randomness pervades the natural processes arou...,Stanford,"Stanford, California",17,Algorithms
8,COMP 160,Algorithms,Introduction to the study of algorithms. Strat...,Tufts,"Medford, Massachusetts",17,Algorithms
9,CSE 633,Parallel Algorithms,"The course will focus on the design, implement...",University at Buffalo,"Buffalo, New York",17,Algorithms


In [74]:
remove_words = ['data', 'science', 'analysis', 'introduction']
university_data['Course Title Stripped'] = university_data['Course Title'].apply(lambda x: x.lower())
university_data['Course Title Stripped'] = university_data['Course Title Stripped'].apply(lambda x: nltk.tokenize.word_tokenize(x))
university_data['Course Title Stripped'] = university_data['Course Title Stripped'].apply(lambda x: [t for t in x if t not in remove_words])
university_data['Course Title Stripped'] = university_data['Course Title Stripped'].apply(lambda x: nltk.pos_tag(x))
university_data['Course Title Stripped'] = university_data['Course Title Stripped'].apply(lambda x: [t for t, pos in x if pos == 'NN' or pos == 'NNS' or pos == 'JJ'])
university_data['Course Title Stripped'] = university_data['Course Title Stripped'].apply(lambda x: [lemmatizer.lemmatize(t) for t in x])
university_data['Course Title Stripped'] = university_data['Course Title Stripped'].apply(', '.join)

In [75]:
university_data.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster,Minor Cluster Name,Course Title Stripped
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",15,Algorithms,"design, algorithm"
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",15,Algorithms,"design, algorithm"
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",15,Algorithms,"large-scale, social, complex, network, design,..."
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",15,Algorithms,"design, efficient, algorithm"
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",17,Algorithms,algorithm


In [76]:
# Vectorizer

vectorizer = TfidfVectorizer(stop_words='english', lowercase=False)
X = vectorizer.fit_transform(list(university_data['Course Title Stripped']))
terms = vectorizer.get_feature_names()

# Kmeans Model
model = KMeans(n_clusters=12, max_iter=20000, random_state=2) 
model.fit(X)
clusters = model.labels_.tolist()

# Model save
joblib.dump(model, 'desc_cluster_nouns_title.pkl')

['desc_cluster_nouns_title.pkl']

In [77]:
# Get Model
model = joblib.load('desc_cluster_nouns_title.pkl')
clusters = model.labels_.tolist()

university_data['Cluster'] = clusters
university_data = university_data.reset_index(drop=True)

In [78]:
university_data

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster,Minor Cluster Name,Course Title Stripped,Cluster
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",15,Algorithms,"design, algorithm",9
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",15,Algorithms,"design, algorithm",9
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",15,Algorithms,"large-scale, social, complex, network, design,...",9
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",15,Algorithms,"design, efficient, algorithm",9
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",17,Algorithms,algorithm,9
5,CS 430,Introduction to Algorithms,"Introduction to the design, behavior, and anal...",Illinois Institute of Tech,"Chicago, Illinois",17,Algorithms,,11
6,CS 5800,Algorithms,Presents the mathematical techniques used for ...,Northeastern,"Boston, Massachusetts",17,Algorithms,algorithm,9
7,CME 309,Randomized Algorithms and Probabilistic Analysis,Randomness pervades the natural processes arou...,Stanford,"Stanford, California",17,Algorithms,"algorithm, probabilistic",9
8,COMP 160,Algorithms,Introduction to the study of algorithms. Strat...,Tufts,"Medford, Massachusetts",17,Algorithms,algorithm,9
9,CSE 633,Parallel Algorithms,"The course will focus on the design, implement...",University at Buffalo,"Buffalo, New York",17,Algorithms,"parallel, algorithm",9


In [79]:
university_data = university_data.drop(['Minor Cluster', 'Minor Cluster Name'], axis=1)
university_data['Minor Cluster'] = university_data['Cluster']
university_data = university_data.drop(['Cluster'], axis=1)
university_data['Minor Cluster Name'] = ''
university_data['Major Cluster'] = ''
university_data['Major Cluster Name'] = ''
university_data.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Course Title Stripped,Minor Cluster,Minor Cluster Name,Major Cluster,Major Cluster Name
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina","design, algorithm",9,,,
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois","design, algorithm",9,,,
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California","large-scale, social, complex, network, design,...",9,,,
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York","design, efficient, algorithm",9,,,
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",algorithm,9,,,


In [82]:
pd.set_option('display.max_rows', None)

for i in university_data['Minor Cluster'].unique():
    print('Minor Cluster:',i)
    print('Course Titles:', university_data.loc[university_data['Minor Cluster'] == i, 'Course Title Stripped'])
    print()

Minor Cluster: 9
Course Titles: 0                                      design, algorithm
1                                      design, algorithm
2      large-scale, social, complex, network, design,...
3                           design, efficient, algorithm
4                                              algorithm
6                                              algorithm
7                               algorithm, probabilistic
8                                              algorithm
9                                    parallel, algorithm
10                              approximation, algorithm
11                                             algorithm
12                                     design, algorithm
13                                     design, algorithm
14         cyber-physical, system, networking, algorithm
15                                   advanced, algorithm
16                      discrete, mathematics, algorithm
17                 large-scale, mining, model, algorithm

In [83]:
university_data.groupby('Minor Cluster').count()

Unnamed: 0_level_0,Course ID,Course Title,Course Description,University,Location,Course Title Stripped,Minor Cluster Name,Major Cluster,Major Cluster Name
Minor Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,50,50,50,50,50,50,50,50,50
1,46,46,46,46,46,46,46,46,46
2,51,51,51,51,51,51,51,51,51
3,33,33,33,33,33,33,33,33,33
4,20,20,20,20,20,20,20,20,20
5,36,36,36,36,36,36,36,36,36
6,48,48,48,48,48,48,48,48,48
7,40,40,40,40,40,40,40,40,40
8,20,20,20,20,20,20,20,20,20
9,37,37,37,37,37,37,37,37,37


In [90]:
# Random/Unknown Cluster Cleaning 

for i in university_data.index:
    #if university_data.iloc[i]['Minor Cluster']  == 2 or university_data.iloc[i]['Minor Cluster']  == 6 or university_data.iloc[i]['Minor Cluster']  == 3 or university_data.iloc[i]['Minor Cluster']  == 11 or university_data.iloc[i]['Minor Cluster']  == 8 or university_data.iloc[i]['Minor Cluster']  == 0:
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['algorithm', 'optimization']):
        university_data['Minor Cluster'].iloc[i] = 9 # Algorithms - Math 
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['visual','animation', 'graph', 'visualization', 'visualize']):
        university_data['Minor Cluster'].iloc[i] = 12 # Visualization
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['mining', 'munging', 'cleaning']):
        university_data['Minor Cluster'].iloc[i] = 3 # Data Mining
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['big', 'warehousing', 'management', 'database', 'storage', 'stores', 'acquisition', 'etl', 'integration', 'retrieval']):
        university_data['Minor Cluster'].iloc[i] = 5 # Data Warehousing            
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['algebra', 'matrix', 'math', 'calculus', 'derivative', 'differential', 'vector']):
        university_data['Minor Cluster'].iloc[i] = 13 # Theory - Math 
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['text','natural','linguistic', 'language', 'speech']):
        university_data['Minor Cluster'].iloc[i] = 14 # NLP - ML          
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['model', 'regression', 'multivariate', 'glm', 'linear']):
        university_data['Minor Cluster'].iloc[i] = 6 # Modeling - Stats
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['time', 'series', 'probability', 'random', 'stochastic', 'statistics']):
        university_data['Minor Cluster'].iloc[i] = 0 # Theory - Stats 
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['machine', 'deep', 'neural', 'learning']):
        university_data['Minor Cluster'].iloc[i] = 2 # Deep Learning - ML
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['os', 'operating', 'software', 'security', 'hardware', 'cybersecurity', 'internet', 'engineering', 'architecture']):
        university_data['Minor Cluster'].iloc[i] = 7 # Software - CS
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['parallel', 'distributed', 'cloud', 'computing', 'performance']):
        university_data['Minor Cluster'].iloc[i] = 4 # HPC - CS
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['capstone', 'practicum', 'seminar', 'study', 'research', 'seminar', 'internship']):
        university_data['Minor Cluster'].iloc[i] = 10 # Thesis - Topics
    if any(c in university_data['Course Title Stripped'].iloc[i] for c in ['society', 'media', 'social', 'politics', 'political', 'econ', 'economics', 'business', 'finance', 'marketing', 'public', 'policy', 'entrepreneur', 'global', 'environment', 'ethics', 'communication', 'health', 'medicine', 'humanities', 'survival']):
        university_data['Minor Cluster'].iloc[i] = 1 # Business - Topics  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [91]:
university_data.groupby('Minor Cluster').count()

Unnamed: 0_level_0,Course ID,Course Title,Course Description,University,Location,Course Title Stripped,Minor Cluster Name,Major Cluster,Major Cluster Name
Minor Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,82,82,82,82,82,82,82,82,82
1,121,121,121,121,121,121,121,121,121
2,89,89,89,89,89,89,89,89,89
3,26,26,26,26,26,26,26,26,26
4,30,30,30,30,30,30,30,30,30
5,84,84,84,84,84,84,84,84,84
6,93,93,93,93,93,93,93,93,93
7,123,123,123,123,123,123,123,123,123
9,41,41,41,41,41,41,41,41,41
10,60,60,60,60,60,60,60,60,60


In [92]:
pd.set_option('display.max_rows', None)

for i in university_data['Minor Cluster'].unique():
    print('Minor Cluster:',i)
    print('Course Titles:', university_data.loc[university_data['Minor Cluster'] == i, 'Course Title Stripped'])
    print()

Minor Cluster: 9
Course Titles: 0                                      design, algorithm
1                                      design, algorithm
3                           design, efficient, algorithm
4                                              algorithm
6                                              algorithm
7                               algorithm, probabilistic
8                                              algorithm
10                              approximation, algorithm
11                                             algorithm
12                                     design, algorithm
13                                     design, algorithm
14         cyber-physical, system, networking, algorithm
15                                   advanced, algorithm
19                                   advanced, algorithm
20                          design, efficient, algorithm
21         cyber-physical, system, networking, algorithm
22                                   advanced, algorithm

In [93]:
for i in university_data.index:
    if university_data.iloc[i]['Minor Cluster']  == 9:
        university_data['Minor Cluster Name'].iloc[i] = 'Algorithms'
        university_data['Major Cluster'].iloc[i] = 0
        university_data['Major Cluster Name'].iloc[i] = 'Math'
    if university_data.iloc[i]['Minor Cluster']  == 12:
        university_data['Minor Cluster Name'].iloc[i] = 'Visualization'
        university_data['Major Cluster'].iloc[i] = 7
        university_data['Major Cluster Name'].iloc[i] = 'Visualization'
    if university_data.iloc[i]['Minor Cluster']  == 3:
        university_data['Minor Cluster Name'].iloc[i] = 'Data Mining'
        university_data['Major Cluster'].iloc[i] = 4
        university_data['Major Cluster Name'].iloc[i] = 'Data Mining'
    if university_data.iloc[i]['Minor Cluster']  == 13:
        university_data['Minor Cluster Name'].iloc[i] = 'Theory'
        university_data['Major Cluster'].iloc[i] = 0
        university_data['Major Cluster Name'].iloc[i] = 'Math'
    if university_data.iloc[i]['Minor Cluster']  == 0:
        university_data['Minor Cluster Name'].iloc[i] = 'Other'
        university_data['Major Cluster'].iloc[i] = 1
        university_data['Major Cluster Name'].iloc[i] = 'Topics'
    if university_data.iloc[i]['Minor Cluster']  == 6:
        university_data['Minor Cluster Name'].iloc[i] = 'Modeling'
        university_data['Major Cluster'].iloc[i] = 2
        university_data['Major Cluster Name'].iloc[i] = 'Statistics'
    if university_data.iloc[i]['Minor Cluster']  == 2:
        university_data['Minor Cluster Name'].iloc[i] = 'Deep Learning'
        university_data['Major Cluster'].iloc[i] = 6
        university_data['Major Cluster Name'].iloc[i] = 'Machine Learning'
    if university_data.iloc[i]['Minor Cluster']  == 4:
        university_data['Minor Cluster Name'].iloc[i] = 'HPC'
        university_data['Major Cluster'].iloc[i] = 5
        university_data['Major Cluster Name'].iloc[i] = 'Computer Science'
    if university_data.iloc[i]['Minor Cluster']  == 5:
        university_data['Minor Cluster Name'].iloc[i] = 'Data Warehousing'
        university_data['Major Cluster'].iloc[i] = 3
        university_data['Major Cluster Name'].iloc[i] = 'Data Warehousing'
    if university_data.iloc[i]['Minor Cluster']  == 14:
        university_data['Minor Cluster Name'].iloc[i] = 'NLP'
        university_data['Major Cluster'].iloc[i] = 6
        university_data['Major Cluster Name'].iloc[i] = 'Machine Learning'
    if university_data.iloc[i]['Minor Cluster']  == 7:
        university_data['Minor Cluster Name'].iloc[i] = 'Software'
        university_data['Major Cluster'].iloc[i] = 5
        university_data['Major Cluster Name'].iloc[i] = 'Computer Science'
    if university_data.iloc[i]['Minor Cluster']  == 10:
        university_data['Minor Cluster Name'].iloc[i] = 'Thesis'
        university_data['Major Cluster'].iloc[i] = 1
        university_data['Major Cluster Name'].iloc[i] = 'Topics'
    if university_data.iloc[i]['Minor Cluster']  == 1:
        university_data['Minor Cluster Name'].iloc[i] = 'Business'
        university_data['Major Cluster'].iloc[i] = 1
        university_data['Major Cluster Name'].iloc[i] = 'Topics'
    if university_data.iloc[i]['Minor Cluster']  == 0:
        university_data['Minor Cluster Name'].iloc[i] = 'Theory'
        university_data['Major Cluster'].iloc[i] = 2
        university_data['Major Cluster Name'].iloc[i] = 'Statistics'
    if university_data.iloc[i]['Minor Cluster']  == 11:
        university_data['Minor Cluster Name'].iloc[i] = 'Other'
        university_data['Major Cluster'].iloc[i] = 1
        university_data['Major Cluster Name'].iloc[i] = 'Topics'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [94]:
university_data

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Course Title Stripped,Minor Cluster,Minor Cluster Name,Major Cluster,Major Cluster Name
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina","design, algorithm",9,Algorithms,0.0,Math
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois","design, algorithm",9,Algorithms,0.0,Math
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California","large-scale, social, complex, network, design,...",1,Business,1.0,Topics
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York","design, efficient, algorithm",9,Algorithms,0.0,Math
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",algorithm,9,Algorithms,0.0,Math
5,CS 430,Introduction to Algorithms,"Introduction to the design, behavior, and anal...",Illinois Institute of Tech,"Chicago, Illinois",,11,,,
6,CS 5800,Algorithms,Presents the mathematical techniques used for ...,Northeastern,"Boston, Massachusetts",algorithm,9,Algorithms,0.0,Math
7,CME 309,Randomized Algorithms and Probabilistic Analysis,Randomness pervades the natural processes arou...,Stanford,"Stanford, California","algorithm, probabilistic",9,Algorithms,0.0,Math
8,COMP 160,Algorithms,Introduction to the study of algorithms. Strat...,Tufts,"Medford, Massachusetts",algorithm,9,Algorithms,0.0,Math
9,CSE 633,Parallel Algorithms,"The course will focus on the design, implement...",University at Buffalo,"Buffalo, New York","parallel, algorithm",4,HPC,5.0,Computer Science


In [95]:
university_data.to_csv('university_clusters_NOUNS_Titles.csv')

## Course Description Nouns & Keywords

In [None]:
# Import 'university_clusters_NOUNS_Titles.csv'

In [96]:
university_data['Course Description Stripped'] = university_data['Course Description'].apply(lambda x: x.lower())
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(lambda x: nltk.tokenize.word_tokenize(x))
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(lambda x: nltk.pos_tag(x))
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(lambda x: [t for t, pos in x if pos == 'NN' or pos == 'NNS'])
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(lambda x: [lemmatizer.lemmatize(t) for t in x])
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(', '.join)

In [97]:
university_data.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Course Title Stripped,Minor Cluster,Minor Cluster Name,Major Cluster,Major Cluster Name,Course Description Stripped
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina","design, algorithm",9,Algorithms,0,Math,"design, analysis, algorithm, paradigm, applica..."
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois","design, algorithm",9,Algorithms,0,Math,"design, algorithm, variety, problem, proof, co..."
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California","large-scale, social, complex, network, design,...",1,Business,1,Topics,"modeling, design, network, network, network, w..."
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York","design, efficient, algorithm",9,Algorithms,0,Math,"design, program, efficiency, divide-and-conque..."
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",algorithm,9,Algorithms,0,Math,"course, level, introduction, design, analysis,..."


In [128]:
pd.set_option('display.max_rows', None)

for i in university_data['Minor Cluster'].unique():
    print('Minor Cluster:',i)
    print('Course Titles:', university_data.loc[university_data['Minor Cluster'] == i, 'Course Title'])
    print()

Minor Cluster: 9
Course Titles: 0                      Design and Analysis of Algorithms
1                      Design and Analysis of Algorithms
3              Design & Analysis of Efficient Algorithms
4                                             Algorithms
6                                             Algorithms
7       Randomized Algorithms and Probabilistic Analysis
8                                             Algorithms
10                              Approximation Algorithms
11                                Analysis of Algorithms
12                     Design and Analysis of Algorithms
13                     Design and Analysis of Algorithms
14     Cyber-Physical Systems: Networking and Algorithms
15                                   Advanced Algorithms
19                                   Advanced Algorithms
20             Design & Analysis of Efficient Algorithms
21     Cyber-Physical Systems: Networking and Algorithms
22                                   Advanced Algorithms

In [182]:
# Top 5 words per 'Minor Cluster'

pd.set_option('display.max_colwidth', -1)
top_n = university_data.groupby(['Minor Cluster']).agg(lambda x: x.value_counts().index[0])
top_n = top_n['Course Description Stripped'].apply(lambda x: x.split()[:5])
top_n

# for i in top_n['Course Description Stripped']:
#     print(top_n.loc[top_n['Course Description Stripped'] == i, 'Minor Cluster Name'] + '- ' + top_n.loc[top_n['Course Description Stripped'] == i, 'Major Cluster Name'])
#     print('Top 5 Words:',i.split()[:5])
#     print()

Minor Cluster
0     [comparison,, treatment,, random,, sampling,, randomization,]
1     [network,, tool,, system,, emergence,, network,]             
2     [introduce,, problem,, machine,, learning,, understanding,]  
3     [course,, topic,, data,, perspective,, attention,]           
4     [course,, tour,, research,, topic,, computing,]              
5     [course,, theory,, practice,, system,, information,]         
6     [introduction,, variety,, data,, modeling,, tool,]           
7     [course,, concept,, mechanism,, protocol,, data,]            
9     [design,, algorithm,, variety,, problem,, proof,]            
10    [student,, skill,, industry,, partner,, faculty,]            
11    [student,, foundation,, software,, design,, programming,]    
12    [fundamental,, cartography,, data,, structure,, information,]
13    [topic,, algebraic,, graph,, theory,, matroids,]             
14    [course,, concept,, method,, insight,, amount,]              
Name: Course Description Stripped,

In [168]:
university_data.to_csv('university_clusters_NOUNS_Titles_Desc.csv')

## Do not use 

In [236]:
copy = university_data.copy()
copy.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster,Minor Cluster Name,Cluster
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",15,Algorithms,9
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",15,Algorithms,6
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",15,Algorithms,2
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",15,Algorithms,9
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",17,Algorithms,16


In [237]:
# Merge repetitive clusters together 
copy.loc[copy['Cluster'] == 28, 'Cluster'] = 3 # Merge Machine Learning clusters
copy.loc[copy['Cluster'] == 14, 'Cluster'] = 10 # Merge Topics- Social Science/Business clusters
copy.loc[copy['Cluster'] == 21, 'Cluster'] = 16 # Merge Topics- other clusters
copy.loc[copy['Cluster'] == 15, 'Cluster'] = 1 # Stats- Methods
copy.loc[copy['Cluster'] == 3, 'Cluster'] = 28 # Deep Learning/ML
copy.loc[copy['Cluster'] == 29, 'Cluster'] = 0 # Visualization
copy.loc[copy['Cluster'] == 23, 'Cluster'] = 1 # Stats
copy.loc[copy['Cluster'] == 6, 'Cluster'] = 16 # Other

In [239]:
# Additional Cleaning
for i, row in copy.iterrows():
    if any(c in copy['Course Title'].iloc[i] for c in ['Acquisition', 'ETL', 'Database', 'Data Engineer', 'Integration', 'Retrieval']):
            copy['Cluster'].iloc[i] = 12 # Data Warehousing

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [240]:
for i in copy.Cluster.unique():
    print('Cluster:',i)
    print('Course Titles:', copy.loc[copy['Cluster'] == i, 'Course Title'])
    print()

Cluster: 9
Course Titles: 0                Design and Analysis of Algorithms
3        Design & Analysis of Efficient Algorithms
4                                       Algorithms
5                       Introduction to Algorithms
6                                       Algorithms
8                                       Algorithms
10                        Approximation Algorithms
11                          Analysis of Algorithms
12               Design and Analysis of Algorithms
15                             Advanced Algorithms
16             Discrete Mathematics and Algorithms
19                             Advanced Algorithms
20       Design & Analysis of Efficient Algorithms
22                             Advanced Algorithms
23             Discrete Mathematics and Algorithms
24                             Advanced Algorithms
25                  Data Structures and Algorithms
26                     Algorithms for Data Science
28                    Data Structrues & Algorithms
29   

In [201]:
#print(copy[copy["Cluster"]==8])
#print("Cluster 16 - Unknown:", copy[copy["Cluster"]==16].count()['Cluster'])

        Course ID                                       Course Title  \
163       CPE 691                       Information Systems Security   
166       CPE 691                       Information Systems Security   
244       INF 529                Security and Privacy in Informatics   
307   COMPSCI 677                  Distributed and Operating Systems   
371      ENVS-655       Environmental Geographic Information Systems   
372     GEOG 6304                 Geographical Information Systems I   
373       CEE 187                   Geographical Information Systems   
374     GEOG 6304                 Geographical Information Systems I   
375     PPUA 5263  Geographic Information Systems for Urban and R...   
376       CEE 187                   Geographical Information Systems   
408     EECE 7337                                 Information Theory   
409        EE 127                                 Information Theory   
410        EE 634        Principles of Information Theory and Co

In [245]:
copy = copy.drop(['Minor Cluster', 'Minor Cluster Name'], axis=1)
copy['Minor Cluster'] = copy['Cluster']
copy = copy.drop(['Cluster'], axis=1)
copy.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",9
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",0
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",27
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",9
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",9


In [252]:
for i in copy.index:
    if copy.iloc[i]['Minor Cluster']  == 0:
        copy['Minor Cluster Name'].iloc[i] = 'Visualization'
        copy['Major Cluster'].iloc[i] = 7
        copy['Major Cluster Name'].iloc[i] = 'Visualization'
    if copy.iloc[i]['Minor Cluster']  == 1:
        copy['Minor Cluster Name'].iloc[i] = 'Theory'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 7:
        copy['Minor Cluster Name'].iloc[i] = 'Modeling'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 11:
        copy['Minor Cluster Name'].iloc[i] = 'Stochastic Processes'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 22:
        copy['Minor Cluster Name'].iloc[i] = 'Probability'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 26:
        copy['Minor Cluster Name'].iloc[i] = 'Time Series'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 20:
        copy['Minor Cluster Name'].iloc[i] = 'Security'
        copy['Major Cluster'].iloc[i] = 0
        copy['Major Cluster Name'].iloc[i] = 'Computer Science'
    if copy.iloc[i]['Minor Cluster']  == 27:
        copy['Minor Cluster Name'].iloc[i] = 'Coding'
        copy['Major Cluster'].iloc[i] = 0
        copy['Major Cluster Name'].iloc[i] = 'Computer Science'
    if copy.iloc[i]['Minor Cluster']  == 13:
        copy['Minor Cluster Name'].iloc[i] = 'NLP'
        copy['Major Cluster'].iloc[i] = 1
        copy['Major Cluster Name'].iloc[i] = 'Machine Learning'
    if copy.iloc[i]['Minor Cluster']  == 28:
        copy['Minor Cluster Name'].iloc[i] = 'Deep Learning'
        copy['Major Cluster'].iloc[i] = 1
        copy['Major Cluster Name'].iloc[i] = 'Machine Learning'
    if copy.iloc[i]['Minor Cluster']  == 9:
        copy['Minor Cluster Name'].iloc[i] = 'Algorithms/Optimization'
        copy['Major Cluster'].iloc[i] = 3
        copy['Major Cluster Name'].iloc[i] = 'Math'
    if copy.iloc[i]['Minor Cluster']  == 18:
        copy['Minor Cluster Name'].iloc[i] = 'Theory'
        copy['Major Cluster'].iloc[i] = 3
        copy['Major Cluster Name'].iloc[i] = 'Math'
    if copy.iloc[i]['Minor Cluster']  == 12:
        copy['Minor Cluster Name'].iloc[i] = 'Data Warehousing'
        copy['Major Cluster'].iloc[i] = 4
        copy['Major Cluster Name'].iloc[i] = 'Data Warehousing'
    if copy.iloc[i]['Minor Cluster']  == 8:
        copy['Minor Cluster Name'].iloc[i] = 'Software/OS'
        copy['Major Cluster'].iloc[i] = 5
        copy['Major Cluster Name'].iloc[i] = 'HPC'
    if copy.iloc[i]['Minor Cluster']  == 17:
        copy['Minor Cluster Name'].iloc[i] = 'Parallel Computing'
        copy['Major Cluster'].iloc[i] = 5
        copy['Major Cluster Name'].iloc[i] = 'HPC'
    if copy.iloc[i]['Minor Cluster']  == 15:
        copy['Minor Cluster Name'].iloc[i] = 'Cloud Computing'
        copy['Major Cluster'].iloc[i] = 5
        copy['Major Cluster Name'].iloc[i] = 'HPC'
    if copy.iloc[i]['Minor Cluster']  == 24:
        copy['Minor Cluster Name'].iloc[i] = 'Data Mining'
        copy['Major Cluster'].iloc[i] = 6
        copy['Major Cluster Name'].iloc[i] = 'Data Mining'
    if copy.iloc[i]['Minor Cluster']  == 10:
        copy['Minor Cluster Name'].iloc[i] = 'Business/Social Science'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 19:
        copy['Minor Cluster Name'].iloc[i] = 'Geospatial'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 25:
        copy['Minor Cluster Name'].iloc[i] = 'Thesis'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 5:
        copy['Minor Cluster Name'].iloc[i] = 'Data Science'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 2:
        copy['Minor Cluster Name'].iloc[i] = 'Biology'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 16:
        copy['Minor Cluster Name'].iloc[i] = 'Other'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [254]:
copy.to_csv('university_clusters.csv')