In [1]:
import pandas as pd
import nltk
from collections import Counter
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.externals import joblib
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer 
from nltk.stem import PorterStemmer
lemmatizer = WordNetLemmatizer() 
ps = PorterStemmer()

university_data = pd.read_excel('Labeled Courses Orig.xlsx', header=0)
university_data

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster,Minor Cluster Name
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",15,Algorithms
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",15,Algorithms
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",15,Algorithms
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",15,Algorithms
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",17,Algorithms
5,CS 430,Introduction to Algorithms,"Introduction to the design, behavior, and anal...",Illinois Institute of Tech,"Chicago, Illinois",17,Algorithms
6,CS 5800,Algorithms,Presents the mathematical techniques used for ...,Northeastern,"Boston, Massachusetts",17,Algorithms
7,CME 309,Randomized Algorithms and Probabilistic Analysis,Randomness pervades the natural processes arou...,Stanford,"Stanford, California",17,Algorithms
8,COMP 160,Algorithms,Introduction to the study of algorithms. Strat...,Tufts,"Medford, Massachusetts",17,Algorithms
9,CSE 633,Parallel Algorithms,"The course will focus on the design, implement...",University at Buffalo,"Buffalo, New York",17,Algorithms


In [2]:
university_data['Course Description Stripped'] = university_data['Course Description'].apply(lambda x: x.lower())
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(lambda x: nltk.tokenize.word_tokenize(x))
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(lambda x: nltk.pos_tag(x))
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(lambda x: [t for t, pos in x if pos == 'NN'])
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(lambda x: [lemmatizer.lemmatize(t) for t in x])
university_data['Course Description Stripped'] = university_data['Course Description Stripped'].apply(', '.join)

In [3]:
university_data.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster,Minor Cluster Name,Course Description Stripped
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",15,Algorithms,"design, analysis, algorithm, paradigm, graph, ..."
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",15,Algorithms,"design, algorithm, variety, proof, correctness..."
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",15,Algorithms,"modeling, design, world, web, gene, modeling, ..."
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",15,Algorithms,"design, efficiency, divide-and-conquer, proces..."
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",17,Algorithms,"course, level, introduction, design, analysis,..."


In [33]:
# # Vectorizer

vectorizer = TfidfVectorizer(stop_words='english', lowercase=False)
X = vectorizer.fit_transform(list(university_data['Course Description Stripped']))
terms = vectorizer.get_feature_names()

# Kmeans Model
model = KMeans(n_clusters=12, max_iter=20000, random_state=2) 
model.fit(X)
clusters = model.labels_.tolist()

# Model save
joblib.dump(model, 'desc_cluster_nouns.pkl')

['desc_cluster_nouns.pkl']

In [4]:
# Get Model
model = joblib.load('desc_cluster_nouns.pkl')
clusters = model.labels_.tolist()

university_data['Cluster'] = clusters
university_data = university_data.reset_index(drop=True)

In [5]:
university_data

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster,Minor Cluster Name,Course Description Stripped,Cluster
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",15,Algorithms,"design, analysis, algorithm, paradigm, graph, ...",0
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",15,Algorithms,"design, algorithm, variety, proof, correctness...",0
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",15,Algorithms,"modeling, design, world, web, gene, modeling, ...",0
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",15,Algorithms,"design, efficiency, divide-and-conquer, proces...",0
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",17,Algorithms,"course, level, introduction, design, analysis,...",4
5,CS 430,Introduction to Algorithms,"Introduction to the design, behavior, and anal...",Illinois Institute of Tech,"Chicago, Illinois",17,Algorithms,"introduction, design, behavior, analysis, comp...",0
6,CS 5800,Algorithms,Presents the mathematical techniques used for ...,Northeastern,"Boston, Massachusetts",17,Algorithms,"design, analysis, computer, algorithm, design,...",0
7,CME 309,Randomized Algorithms and Probabilistic Analysis,Randomness pervades the natural processes arou...,Stanford,"Stanford, California",17,Algorithms,"randomness, formation, recombination, randomne...",0
8,COMP 160,Algorithms,Introduction to the study of algorithms. Strat...,Tufts,"Medford, Massachusetts",17,Algorithms,"introduction, study, algorithm, divide-and-con...",0
9,CSE 633,Parallel Algorithms,"The course will focus on the design, implement...",University at Buffalo,"Buffalo, New York",17,Algorithms,"course, design, implementation, analysis, solu...",0


In [6]:
university_data = university_data.drop(['Minor Cluster', 'Minor Cluster Name'], axis=1)
university_data['Minor Cluster'] = university_data['Cluster']
university_data = university_data.drop(['Cluster'], axis=1)
university_data['Minor Cluster Name'] = ''
university_data['Major Cluster'] = ''
university_data['Major Cluster Name'] = ''
university_data.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Course Description Stripped,Minor Cluster,Minor Cluster Name,Major Cluster,Major Cluster Name
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina","design, analysis, algorithm, paradigm, graph, ...",0,,,
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois","design, algorithm, variety, proof, correctness...",0,,,
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California","modeling, design, world, web, gene, modeling, ...",0,,,
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York","design, efficiency, divide-and-conquer, proces...",0,,,
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia","course, level, introduction, design, analysis,...",4,,,


In [39]:
for i in university_data['Minor Cluster'].unique():
    print('Minor Cluster:',i)
    print('Course Titles:', university_data.loc[university_data['Minor Cluster'] == i, 'Course Title'])
    print()

Minor Cluster: 0
Course Titles: 0                      Design and Analysis of Algorithms
1                      Design and Analysis of Algorithms
2      Large-Scale Social and Complex Networks: Desig...
3              Design & Analysis of Efficient Algorithms
5                             Introduction to Algorithms
6                                             Algorithms
7       Randomized Algorithms and Probabilistic Analysis
8                                             Algorithms
9                                    Parallel Algorithms
10                              Approximation Algorithms
11                                Analysis of Algorithms
12                     Design and Analysis of Algorithms
13                     Design and Analysis of Algorithms
16                   Discrete Mathematics and Algorithms
18     Large-Scale Social and Complex Networks: Desig...
19                                   Advanced Algorithms
20             Design & Analysis of Efficient Algorithms

In [7]:
for i in university_data.index:
    if university_data.iloc[i]['Minor Cluster']  == 0:
        university_data['Minor Cluster Name'].iloc[i] = 'Algorithms'
        university_data['Major Cluster'].iloc[i] = 0
        university_data['Major Cluster Name'].iloc[i] = 'Math'
    if university_data.iloc[i]['Minor Cluster']  == 4:
        university_data['Minor Cluster Name'].iloc[i] = 'Software'
        university_data['Major Cluster'].iloc[i] = 1
        university_data['Major Cluster Name'].iloc[i] = 'HPC'
    if university_data.iloc[i]['Minor Cluster']  == 3:
        university_data['Minor Cluster Name'].iloc[i] = 'Security'
        university_data['Major Cluster'].iloc[i] = 2
        university_data['Major Cluster Name'].iloc[i] = 'Computer Science'
    if university_data.iloc[i]['Minor Cluster']  == 1:
        university_data['Minor Cluster Name'].iloc[i] = 'Other'
        university_data['Major Cluster'].iloc[i] = 3
        university_data['Major Cluster Name'].iloc[i] = 'Topics'
    if university_data.iloc[i]['Minor Cluster']  == 9:
        university_data['Minor Cluster Name'].iloc[i] = 'Machine Learning'
        university_data['Major Cluster'].iloc[i] = 4
        university_data['Major Cluster Name'].iloc[i] = 'Machine Learning'
    if university_data.iloc[i]['Minor Cluster']  == 7:
        university_data['Minor Cluster Name'].iloc[i] = 'Data Science'
        university_data['Major Cluster'].iloc[i] = 3
        university_data['Major Cluster Name'].iloc[i] = 'Topics'
    if university_data.iloc[i]['Minor Cluster']  == 2:
        university_data['Minor Cluster Name'].iloc[i] = 'Visualization'
        university_data['Major Cluster'].iloc[i] = 5
        university_data['Major Cluster Name'].iloc[i] = 'Visualization'
    if university_data.iloc[i]['Minor Cluster']  == 6:
        university_data['Minor Cluster Name'].iloc[i] = 'General'
        university_data['Major Cluster'].iloc[i] = 6
        university_data['Major Cluster Name'].iloc[i] = 'Statistics'
    if university_data.iloc[i]['Minor Cluster']  == 11:
        university_data['Minor Cluster Name'].iloc[i] = 'Modeling'
        university_data['Major Cluster'].iloc[i] = 6
        university_data['Major Cluster Name'].iloc[i] = 'Statistics'
    if university_data.iloc[i]['Minor Cluster']  == 8:
        university_data['Minor Cluster Name'].iloc[i] = 'Probability'
        university_data['Major Cluster'].iloc[i] = 6
        university_data['Major Cluster Name'].iloc[i] = 'Statistics'
    if university_data.iloc[i]['Minor Cluster']  == 10:
        university_data['Minor Cluster Name'].iloc[i] = 'Data Warehousing'
        university_data['Major Cluster'].iloc[i] = 10
        university_data['Major Cluster Name'].iloc[i] = 'Data Warehousing'
    if university_data.iloc[i]['Minor Cluster']  == 5:
        university_data['Minor Cluster Name'].iloc[i] = 'Economics'
        university_data['Major Cluster'].iloc[i] = 3
        university_data['Major Cluster Name'].iloc[i] = 'Topics'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
university_data

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Course Description Stripped,Minor Cluster,Minor Cluster Name,Major Cluster,Major Cluster Name
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina","design, analysis, algorithm, paradigm, graph, ...",0,Algorithms,0,Math
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois","design, algorithm, variety, proof, correctness...",0,Algorithms,0,Math
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California","modeling, design, world, web, gene, modeling, ...",0,Algorithms,0,Math
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York","design, efficiency, divide-and-conquer, proces...",0,Algorithms,0,Math
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia","course, level, introduction, design, analysis,...",4,Software,1,HPC
5,CS 430,Introduction to Algorithms,"Introduction to the design, behavior, and anal...",Illinois Institute of Tech,"Chicago, Illinois","introduction, design, behavior, analysis, comp...",0,Algorithms,0,Math
6,CS 5800,Algorithms,Presents the mathematical techniques used for ...,Northeastern,"Boston, Massachusetts","design, analysis, computer, algorithm, design,...",0,Algorithms,0,Math
7,CME 309,Randomized Algorithms and Probabilistic Analysis,Randomness pervades the natural processes arou...,Stanford,"Stanford, California","randomness, formation, recombination, randomne...",0,Algorithms,0,Math
8,COMP 160,Algorithms,Introduction to the study of algorithms. Strat...,Tufts,"Medford, Massachusetts","introduction, study, algorithm, divide-and-con...",0,Algorithms,0,Math
9,CSE 633,Parallel Algorithms,"The course will focus on the design, implement...",University at Buffalo,"Buffalo, New York","course, design, implementation, analysis, solu...",0,Algorithms,0,Math


In [9]:
university_data.to_csv('university_clusters_NOUNS.csv')

## All code below is for cleaning/recategorizing clusters. If clusters are fine, do not run code below

In [236]:
copy = university_data.copy()
copy.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster,Minor Cluster Name,Cluster
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",15,Algorithms,9
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",15,Algorithms,6
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",15,Algorithms,2
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",15,Algorithms,9
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",17,Algorithms,16


In [237]:
# Merge repetitive clusters together 
copy.loc[copy['Cluster'] == 28, 'Cluster'] = 3 # Merge Machine Learning clusters
copy.loc[copy['Cluster'] == 14, 'Cluster'] = 10 # Merge Topics- Social Science/Business clusters
copy.loc[copy['Cluster'] == 21, 'Cluster'] = 16 # Merge Topics- other clusters
copy.loc[copy['Cluster'] == 15, 'Cluster'] = 1 # Stats- Methods
copy.loc[copy['Cluster'] == 3, 'Cluster'] = 28 # Deep Learning/ML
copy.loc[copy['Cluster'] == 29, 'Cluster'] = 0 # Visualization
copy.loc[copy['Cluster'] == 23, 'Cluster'] = 1 # Stats
copy.loc[copy['Cluster'] == 6, 'Cluster'] = 16 # Other

In [238]:
# Random/Unknown Cluster Cleaning 

for i in copy.index:
    if copy.iloc[i]['Cluster']  == 2 or copy.iloc[i]['Cluster']  == 4 or copy.iloc[i]['Cluster']  == 5 or copy.iloc[i]['Cluster']  == 6 or copy.iloc[i]['Cluster']  == 8 or copy.iloc[i]['Cluster']  == 14 or copy.iloc[i]['Cluster']  == 15 or copy.iloc[i]['Cluster']  == 16 or copy.iloc[i]['Cluster']  == 21:
        if any(c in copy['Course Title'].iloc[i] for c in ['Algorithm','Optimization']):
            copy['Cluster'].iloc[i] = 9 # Algorithms/Optimization
        if any(c in copy['Course Title'].iloc[i] for c in ['Visual','Vision','Computer Vision','Animation', 'Graph', 'Design']):
            copy['Cluster'].iloc[i] = 0 # Visualization
        if any(c in copy['Course Title'].iloc[i] for c in ['Text','Natural Language','Linguistic', 'Language', 'Speech']):
            copy['Cluster'].iloc[i] = 13 # Natural Language Processing
        if any(c in copy['Course Title'].iloc[i] for c in ['Spatial','Geospatial','GPS', 'Aerial', 'Imaging']):
            copy['Cluster'].iloc[i] = 19 # Topics- Geospatial
        if any(c in copy['Course Title'].iloc[i] for c in ['Bayesian']):
            copy['Cluster'].iloc[i] = 23 # Stats - Theory 
        if any(c in copy['Course Title'].iloc[i] for c in ['Machine Learning', 'Deep Learning', 'Neural Networks']):
            copy['Cluster'].iloc[i] = 28 # Deep Learning/Machine Learning
        if any(c in copy['Course Title'].iloc[i] for c in ['Capstone', 'Practicum', 'Seminar', 'Study']):
            copy['Cluster'].iloc[i] = 25 # Topics - Thesis
        if any(c in copy['Course Title'].iloc[i] for c in ['Big Data', 'Warehousing', 'Data Management', 'Database', 'Data Storage', 'Data Engineer', 'Data Stores']):
            copy['Cluster'].iloc[i] = 12 # Database/Data Warehousing
        if any(c in copy['Course Title'].iloc[i] for c in ['R', 'Python', 'Unix', 'Linux', 'C', 'Programming', 'Coding']):
            copy['Cluster'].iloc[i] = 27 # Coding
        if any(c in copy['Course Title'].iloc[i] for c in ['Data Mining', 'Data Munging', 'Data Cleaning']):
            copy['Cluster'].iloc[i] = 24 # Data Mining
        if any(c in copy['Course Title'].iloc[i] for c in ['Algebra', 'Matrix', 'Math', 'Calculus', 'Derivative', 'Differential', 'Vector']):
            copy['Cluster'].iloc[i] = 18 # Math - Theory 
        if any(c in copy['Course Title'].iloc[i] for c in ['Society', 'Social Media', 'Social Network', 'Politic', 'Econ', 'Business', 'Finance', 'Market', 'Public', 'Policy', 'Entrepreneur', 'Manage', 'Global', 'Environment', 'Ethics', 'Communication', 'Health', 'Medicine', 'Humanities', 'Survival']):
            copy['Cluster'].iloc[i] = 10 # Topics - Social Science/Business
        if any(c in copy['Course Title'].iloc[i] for c in ['Model', 'Regression', 'Multivariate', 'GLM']):
            copy['Cluster'].iloc[i] = 7 # Stats- Modeling
        if any(c in copy['Course Title'].iloc[i] for c in ['Data Science', 'Analytics']):
            copy['Cluster'].iloc[i] = 5 # Topics- Data Science
        if any(c in copy['Course Title'].iloc[i] for c in ['Bio', 'Neuro', 'Biology', 'Epidemiology', 'Biostat', 'Bioinformatics', 'Genom', 'Molecul']):
            copy['Cluster'].iloc[i] = 2 # Topics- Biology
        if any(c in copy['Course Title'].iloc[i] for c in ['Parallel', 'Distributed']):
            copy['Cluster'].iloc[i] = 17 # Parallel/Distributed Systems
        if any(c in copy['Course Title'].iloc[i] for c in ['OS', 'Operating System', 'Information Systems', 'Information Technology', 'Information Theory']):
            copy['Cluster'].iloc[i] = 8 # Software/OS
        if any(c in copy['Course Title'].iloc[i] for c in ['Acquisition', 'ETL', 'Database', 'Data Engineer', 'Integration', 'Retrieval']):
            copy['Cluster'].iloc[i] = 12 # Data Warehousing
        if any(c in copy['Course Title'].iloc[i] for c in ['Cloud']):
            copy['Cluster'].iloc[i] = 15 # Cloud Computing
        if any(c in copy['Course Title'].iloc[i] for c in ['Security', 'Computer Network', 'Computer System', 'Cyber', 'Crypt']):
            copy['Cluster'].iloc[i] = 20 # Computer/Network Security

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [239]:
# Additional Cleaning
for i, row in copy.iterrows():
    if any(c in copy['Course Title'].iloc[i] for c in ['Acquisition', 'ETL', 'Database', 'Data Engineer', 'Integration', 'Retrieval']):
            copy['Cluster'].iloc[i] = 12 # Data Warehousing

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [240]:
for i in copy.Cluster.unique():
    print('Cluster:',i)
    print('Course Titles:', copy.loc[copy['Cluster'] == i, 'Course Title'])
    print()

Cluster: 9
Course Titles: 0                Design and Analysis of Algorithms
3        Design & Analysis of Efficient Algorithms
4                                       Algorithms
5                       Introduction to Algorithms
6                                       Algorithms
8                                       Algorithms
10                        Approximation Algorithms
11                          Analysis of Algorithms
12               Design and Analysis of Algorithms
15                             Advanced Algorithms
16             Discrete Mathematics and Algorithms
19                             Advanced Algorithms
20       Design & Analysis of Efficient Algorithms
22                             Advanced Algorithms
23             Discrete Mathematics and Algorithms
24                             Advanced Algorithms
25                  Data Structures and Algorithms
26                     Algorithms for Data Science
28                    Data Structrues & Algorithms
29   

In [201]:
#print(copy[copy["Cluster"]==8])
#print("Cluster 16 - Unknown:", copy[copy["Cluster"]==16].count()['Cluster'])

        Course ID                                       Course Title  \
163       CPE 691                       Information Systems Security   
166       CPE 691                       Information Systems Security   
244       INF 529                Security and Privacy in Informatics   
307   COMPSCI 677                  Distributed and Operating Systems   
371      ENVS-655       Environmental Geographic Information Systems   
372     GEOG 6304                 Geographical Information Systems I   
373       CEE 187                   Geographical Information Systems   
374     GEOG 6304                 Geographical Information Systems I   
375     PPUA 5263  Geographic Information Systems for Urban and R...   
376       CEE 187                   Geographical Information Systems   
408     EECE 7337                                 Information Theory   
409        EE 127                                 Information Theory   
410        EE 634        Principles of Information Theory and Co

In [245]:
copy = copy.drop(['Minor Cluster', 'Minor Cluster Name'], axis=1)
copy['Minor Cluster'] = copy['Cluster']
copy = copy.drop(['Cluster'], axis=1)
copy.head()

Unnamed: 0,Course ID,Course Title,Course Description,University,Location,Minor Cluster
0,COMPSCI 532,Design and Analysis of Algorithms,Design and analysis of efficient algorithms. A...,Duke,"Durham, North Carolina",9
1,CS 535,Design and Analysis of Algorithms,Design of efficient algorithms for a variety o...,Illinois Institute of Tech,"Chicago, Illinois",0
2,EC EGNR 232E,Large-Scale Social and Complex Networks: Desig...,Modeling and design of large-scale complex net...,UCLA,"Los Angeles, California",27
3,CSC 482,Design & Analysis of Efficient Algorithms,How does one design programs and ascertain the...,University of Rochester,"Rochester, New York",9
4,CS 526,Algorithms,This course is a graduate level introduction t...,Emory,"Atlanta, Georgia",9


In [252]:
for i in copy.index:
    if copy.iloc[i]['Minor Cluster']  == 0:
        copy['Minor Cluster Name'].iloc[i] = 'Visualization'
        copy['Major Cluster'].iloc[i] = 7
        copy['Major Cluster Name'].iloc[i] = 'Visualization'
    if copy.iloc[i]['Minor Cluster']  == 1:
        copy['Minor Cluster Name'].iloc[i] = 'Theory'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 7:
        copy['Minor Cluster Name'].iloc[i] = 'Modeling'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 11:
        copy['Minor Cluster Name'].iloc[i] = 'Stochastic Processes'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 22:
        copy['Minor Cluster Name'].iloc[i] = 'Probability'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 26:
        copy['Minor Cluster Name'].iloc[i] = 'Time Series'
        copy['Major Cluster'].iloc[i] = 2
        copy['Major Cluster Name'].iloc[i] = 'Statistics'
    if copy.iloc[i]['Minor Cluster']  == 20:
        copy['Minor Cluster Name'].iloc[i] = 'Security'
        copy['Major Cluster'].iloc[i] = 0
        copy['Major Cluster Name'].iloc[i] = 'Computer Science'
    if copy.iloc[i]['Minor Cluster']  == 27:
        copy['Minor Cluster Name'].iloc[i] = 'Coding'
        copy['Major Cluster'].iloc[i] = 0
        copy['Major Cluster Name'].iloc[i] = 'Computer Science'
    if copy.iloc[i]['Minor Cluster']  == 13:
        copy['Minor Cluster Name'].iloc[i] = 'NLP'
        copy['Major Cluster'].iloc[i] = 1
        copy['Major Cluster Name'].iloc[i] = 'Machine Learning'
    if copy.iloc[i]['Minor Cluster']  == 28:
        copy['Minor Cluster Name'].iloc[i] = 'Deep Learning'
        copy['Major Cluster'].iloc[i] = 1
        copy['Major Cluster Name'].iloc[i] = 'Machine Learning'
    if copy.iloc[i]['Minor Cluster']  == 9:
        copy['Minor Cluster Name'].iloc[i] = 'Algorithms/Optimization'
        copy['Major Cluster'].iloc[i] = 3
        copy['Major Cluster Name'].iloc[i] = 'Math'
    if copy.iloc[i]['Minor Cluster']  == 18:
        copy['Minor Cluster Name'].iloc[i] = 'Theory'
        copy['Major Cluster'].iloc[i] = 3
        copy['Major Cluster Name'].iloc[i] = 'Math'
    if copy.iloc[i]['Minor Cluster']  == 12:
        copy['Minor Cluster Name'].iloc[i] = 'Data Warehousing'
        copy['Major Cluster'].iloc[i] = 4
        copy['Major Cluster Name'].iloc[i] = 'Data Warehousing'
    if copy.iloc[i]['Minor Cluster']  == 8:
        copy['Minor Cluster Name'].iloc[i] = 'Software/OS'
        copy['Major Cluster'].iloc[i] = 5
        copy['Major Cluster Name'].iloc[i] = 'HPC'
    if copy.iloc[i]['Minor Cluster']  == 17:
        copy['Minor Cluster Name'].iloc[i] = 'Parallel Computing'
        copy['Major Cluster'].iloc[i] = 5
        copy['Major Cluster Name'].iloc[i] = 'HPC'
    if copy.iloc[i]['Minor Cluster']  == 15:
        copy['Minor Cluster Name'].iloc[i] = 'Cloud Computing'
        copy['Major Cluster'].iloc[i] = 5
        copy['Major Cluster Name'].iloc[i] = 'HPC'
    if copy.iloc[i]['Minor Cluster']  == 24:
        copy['Minor Cluster Name'].iloc[i] = 'Data Mining'
        copy['Major Cluster'].iloc[i] = 6
        copy['Major Cluster Name'].iloc[i] = 'Data Mining'
    if copy.iloc[i]['Minor Cluster']  == 10:
        copy['Minor Cluster Name'].iloc[i] = 'Business/Social Science'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 19:
        copy['Minor Cluster Name'].iloc[i] = 'Geospatial'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 25:
        copy['Minor Cluster Name'].iloc[i] = 'Thesis'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 5:
        copy['Minor Cluster Name'].iloc[i] = 'Data Science'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 2:
        copy['Minor Cluster Name'].iloc[i] = 'Biology'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'
    if copy.iloc[i]['Minor Cluster']  == 16:
        copy['Minor Cluster Name'].iloc[i] = 'Other'
        copy['Major Cluster'].iloc[i] = 8
        copy['Major Cluster Name'].iloc[i] = 'Topics'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [254]:
copy.to_csv('university_clusters.csv')