In [61]:
import seaborn as sns
from sklearn.model_selection import train_test_split
import pyprind
import pandas as pd
import os
import numpy as np
import re # python regular expressions
import string # for efficient operations with strings
import matplotlib.pyplot as plt
%matplotlib inline
from collections import defaultdict
import nltk
nltk.download('punkt') # you will probably need to do this
nltk.download('wordnet') # and this
nltk.download('stopwords') # aand this
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.spatial import distance
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from vaderSentiment import vaderSentiment
from afinn import Afinn
from IPython.display import Image

[nltk_data] Downloading package punkt to /Users/mieharder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mieharder/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mieharder/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Collecting data

In [None]:
""" This cell will save name and ranking of American Universities from US News official 
    ranking 2021 in a dataframe. From here, the cell randomly chooses 10 universities 
    from the bottom of the list and prints them. """

# Accumulators for the scraped rankings and university names; the two lists are
# filled in parallel, one entry per university.
ranking = []
name = []


def interceptor(request):
    """Replace the outgoing ``Referer`` header with a note identifying this scraper.

    args: request: an outgoing request object exposing a mutable ``headers`` mapping
    returns: None (the request is modified in place)
    """
    outgoing_headers = request.headers
    del outgoing_headers['Referer']  # the old header is deleted before the new one is set
    outgoing_headers['Referer'] = 'For an exam at the university of copenhagen for the course ISDS'

# NOTE(review): `webdriver`, `ChromeDriverManager` and `BeautifulSoup` are not imported in the
# import cell visible in this notebook, and `driver.request_interceptor` looks like the
# selenium-wire API - confirm and add the matching imports to the import cell.

# Resolve the chromedriver binary once instead of once per page.
chromedriver_path = ChromeDriverManager().install()

# Fetch every result page, starting with the first one.
# round(338/10)+1 evaluates to 35 pages - NOTE(review): presumably 338 ranked universities at
# 10 per page plus one spare page; confirm.
for j in range(round(338/10)+1):
    url = ('https://www.usnews.com/best-colleges/rankings/national-universities?_page=' + str(j+1))
    driver = webdriver.Chrome(chromedriver_path)
    try:
        # Set the interceptor on the driver so every request carries the identifying header
        driver.request_interceptor = interceptor
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # quit() ends the whole browser session (close() only closes the window), and the
        # finally clause guarantees this also happens when loading the page fails.
        driver.quit()

    rank_cards = soup.findAll('div', {'class':'Box-w0dun1-0 DetailCardColleges__TextContainer-cecerc-4 CYZBT eoCwZT'})
    name_links = soup.findAll('a', {'class':'Anchor-byh49a-0 DetailCardColleges__StyledAnchor-cecerc-7 fidpEI eKrerU card-name'})

    # Cards and name links are paired by position on the page.
    for i, card in enumerate(rank_cards):
        # First run of digits in the card text is stored as the ranking (kept as a string).
        ranking.append(re.findall(r'\d+', card.text)[0])
        name.append(name_links[i].text.strip())

df = pd.DataFrame(list(zip(ranking, name)),
               columns =['Ranking', 'Name'])


#Get subset with ranking 298 (there are 92)
bottom_df = df[df["Ranking"] == '298']

#Draw a random 10
# NOTE(review): the draw is unseeded, so re-running this cell selects different schools.
ten_bottom = bottom_df.sample(10)

print(ten_bottom)

In [1]:
# Universities included in the study, keyed by name. Each row below is
# (name, number of professors, school ID); insertion order is kept, and the
# first ten rows are followed by ten further schools.
dict_of_schools = {
    school_name: {'Number of professors': professor_count, 'School ID': school_id}
    for school_name, professor_count, school_id in [
        ('Princeton University', 326, 780),
        ('Harvard University', 567, 399),
        ('Columbia University', 893, 278),
        ('Massachusetts Institute of Technology', 346, 580),
        ('Yale University', 464, 1222),
        ('Stanford University', 611, 953),
        ('University of Chicago', 507, 1085),
        ('University of Pennsylvania', 523, 169),
        ('California Institute of Technology', 93, 148),
        ('Johns Hopkins University', 830, 464),

        ('Wichita State University', 1713, 1197),
        ('Western Kentucky University', 1669, 1176),
        ('University of Charleston', 94, 1084),
        ('Regent University', 485, 4375),
        ('University of Hawaii at Hilo ', 435, 1105),
        ('Cleveland State University ', 1862, 244),
        ('Husson University', 221, 426),
        ('Palm Beach Atlantic University', 357, 753),
        ('Texas Womans University', 1311, 1014),
        ('University of Texas at Tyler', 717, 4171),
    ]
}

In [None]:
def get_all(school_id, number_of_professors):
    """
    args: school_id: integer
          number_of_professors: integer
    returns: None
    raises: ValueError if the listing pages return no professors at all

    Scrapes RateMyProfessors for one school and writes two csv files:

    1. `{school_id}.csv` - one row per professor, taken from the paginated JSON listing of
       the school, plus a 'Comments' and a 'Tags' column that each hold the list of all
       comments / tags found on the professor's rating page.
    2. `{school_id}comments.csv` - one row per comment with the columns
       'Comments', 'Quality', 'Difficulty', 'Dates' and 'TeacherID'. Professors without
       any comment contribute no rows.

    Every professor page is downloaded exactly once and feeds both files.
    """
    # Local import: `math` is not imported in the notebook's import cell.
    import math
    # NOTE(review): `requests` and `BeautifulSoup` are not imported in the visible import
    # cell either - confirm they are imported elsewhere or add them there.

    identifying_headers = {'user-agent': 'For an exam at the university of copenhagen for the course ISDS'}
    comment_columns = ['Comments', 'Quality', 'Difficulty', 'Dates', 'TeacherID']

    # The page count assumes 20 professors per listing page.
    number_of_pages = math.ceil(number_of_professors / 20)

    professor_records = []
    for page in range(1, number_of_pages + 1):  # all page numbers
        url = f"https://www.ratemyprofessors.com/filter/professor/?&page={page}&filter=teacherlastname_sort_s+asc&query=*%3A*&queryoption=TEACHER&queryBy=schoolId&sid="+str(school_id)
        response = requests.get(url)
        # The JSON payload has the keys ['professors', 'searchResultsTotal', 'remaining', 'type'];
        # 'professors' is a list with one dict per professor.
        professor_records.extend(response.json()['professors'])

    if not professor_records:
        raise ValueError(f'No professors found for school {school_id}')

    # One row per professor, built in a single step instead of concatenating one-row frames.
    df = pd.DataFrame(professor_records)
    list_of_teacher_ids = list(df.tid)  # all professor ids for the given university

    comments_per_teacher = []
    tags_per_teacher = []
    comment_frames = []
    for teacher_id in list_of_teacher_ids:
        my_url = f'https://www.ratemyprofessors.com/ShowRatings.jsp?tid={teacher_id}'
        html = requests.get(my_url, headers=identifying_headers).text
        soup = BeautifulSoup(html, 'lxml')

        comment_nodes = soup.findAll('div', {'class':'Comments__StyledComments-dzzyvm-0 gRjWel'})
        tag_nodes = soup.findAll('span', {'class':'Tag-bs9vf4-0 hHOVKF'})
        date_nodes = soup.findAll('div', {'class':'TimeStamp__StyledTimeStamp-sc-9q2r30-0 bXQmMr RatingHeader__RatingTimeStamp-sc-1dlkqw1-3 BlaCV'})
        rating_nodes = soup.findAll('div', {'class':'CardNumRating__StyledCardNumRating-sc-17t4b9u-0 eWZmyX'})

        finished_comments = [node.text for node in comment_nodes]
        finished_tags = [node.text for node in tag_nodes]
        finished_dates = [node.text for node in date_nodes]
        finished_ratings = [node.text for node in rating_nodes]

        comments_per_teacher.append(finished_comments)
        tags_per_teacher.append(finished_tags)

        if finished_comments:
            # Rating cards alternate: even positions go to 'Quality', odd positions to
            # 'Difficulty'; only the last three characters of each card text are kept.
            # Every second timestamp is kept - NOTE(review): presumably each rating
            # renders its date twice; confirm against the page markup.
            comment_frames.append(pd.DataFrame([
                finished_comments,
                [text[-3:] for text in finished_ratings[0::2]],
                [text[-3:] for text in finished_ratings[1::2]],
                finished_dates[0::2],
                [teacher_id] * len(finished_comments),
            ]).T)

    df['Comments'] = comments_per_teacher  # each cell holds the list of all comments for the professor
    df['Tags'] = tags_per_teacher  # each cell holds the list of all tags for the professor
    df.to_csv(f'{school_id}.csv')  # saves the professor dataframe as a csv file

    ## Second dataframe: one row per comment
    if comment_frames:
        comments_df = pd.concat(comment_frames)
        comments_df.columns = comment_columns
        comments_df = comments_df.reset_index(drop=True)
    else:
        # No professor had a comment: still write a (header-only) file so the
        # later concatenation step finds one file per school.
        comments_df = pd.DataFrame(columns=comment_columns)
    comments_df.to_csv(f'{school_id}comments.csv')
    return


In [None]:
# Runs get_all for every university in dict_of_schools. Beware: this takes several hours.
for school_name, school_info in dict_of_schools.items():
    print(school_name)  # shows which school is currently being scraped
    get_all(school_info['School ID'], school_info['Number of professors'])

## Parsing data

In [82]:
def concat_school_files(path_template):
    """Stack the per-school csv files into one dataframe.

    args: path_template: string with one `{}` placeholder that receives the school ID
    returns: dataframe with the rows of every school's file and an added 'SchoolID' column
    """
    frames = []
    for school_info in dict_of_schools.values():
        nr = school_info['School ID']
        frame = pd.read_csv(path_template.format(nr))
        frame['SchoolID'] = nr  # remember which school each row came from
        frames.append(frame)
    return pd.concat(frames)


# Concatenates all professor dataframes into one and saves it as a csv file
df_all = concat_school_files('initialcsvfiles/{}.csv')
df_all.to_csv('concatinated_df')

# Concatenates all comment dataframes into one and saves it as a csv file
df_all = concat_school_files('initialcsvfiles/{}comments.csv')
df_all.to_csv('concatinated_comments_df')


In [83]:
# School IDs in dictionary order: the first ten entries are the top schools,
# everything after them the bottom schools. Used to label rows in the dataframes.
school_ids = [school_info['School ID'] for school_info in dict_of_schools.values()]
top = school_ids[:10]
bottom = school_ids[10:]


In [84]:
# Read back the two concatenated files (professors first, then comments).
concat_professor, concat_comment = (
    pd.read_csv(path) for path in ('concatinated_df', 'concatinated_comments_df')
)

In [85]:
# Clean the professor data and save the final dataframe as csv:
# drop unused columns, label each professor's school as 'top' or 'bottom',
# then discard every row that still contains a NaN.
unused_columns = ['contentType', 'categoryType', 'Unnamed: 0.1', 'Unnamed: 0',
                  'tLname', 'tMiddlename', 'tFname', 'SchoolID']
df = (
    concat_professor
    .drop(columns=unused_columns)
    .assign(status=lambda frame: np.where(frame['tSid'].isin(top), 'top', 'bottom'))
    .dropna()
)
df.to_csv('df_final.csv')

In [86]:
# Clean the comment data and save the final dataframe as csv.
# Professors whose 'Comments' cell in the professor dataframe is the empty list '[]'
df_no_comment = df[df['Comments'] == '[]']
teachers_without_reviews = list(df_no_comment['tid'])

df_comments = (
    concat_comment
    .loc[~concat_comment['TeacherID'].isin(teachers_without_reviews)]  # drop rows of teachers without reviews
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
    .assign(
        Dates=lambda frame: pd.to_datetime(frame.Dates),  # convert dates to datetime format
        status=lambda frame: np.where(frame['SchoolID'].isin(top), 'top', 'bottom'),  # status column
    )
    .loc[lambda frame: frame['Comments'] != 'No Comments']  # remove placeholder comments
    .dropna()  # remove rows with NaN
)
df_comments.to_csv('df_comments_final.csv')

## Working with the data

In [87]:
# Load the cleaned professor (df_p) and comment (df_c) dataframes from disk
df_p, df_c = map(pd.read_csv, ('df_final.csv', 'df_comments_final.csv'))

In [88]:
with open('STOPlist.txt') as f: #note that we made our own stop words list
    stoppelop = f.read().splitlines() #list of stop words

# Built once rather than on every preprocess() call (the vectorizers call it once per
# comment): a set gives constant-time stop-word lookups instead of a scan of the list
# for every token, and the translation table and lemmatizer do not depend on the input.
_STOP_WORDS = frozenset(stoppelop)
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_LEMMATIZER = nltk.WordNetLemmatizer()


def preprocess(text):
    """Tokenize and lemmatize a text.

    args: text: string
    returns: list of lemmas

    The text is lower-cased, stripped of all characters in string.punctuation,
    tokenized with nltk.word_tokenize and lemmatized with the WordNet lemmatizer.
    Lemmas found in the stop word list, and the lemma 'br', are removed.
    """
    low_text = text.lower().translate(_PUNCTUATION_TABLE)
    lemmas = (_LEMMATIZER.lemmatize(token) for token in nltk.word_tokenize(low_text))
    return [lemma for lemma in lemmas if lemma not in _STOP_WORDS and lemma != 'br']

In [89]:
# New column holding, for every comment, its list of lemmatized tokens
df_c['Clean_comment'] = df_c['Comments'].apply(preprocess)

## Logistic Regression

In [90]:
# Features are the raw comment texts; the label is the 'status' column.
X, y = df_c['Comments'], df_c['status']

# The fixed random_state makes the train/test split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=161193,
)

In [91]:
# Count vectorizer: token counts per comment, tokenized with preprocess()
vectorizerc = CountVectorizer(tokenizer=preprocess)

# Fit on the training comments only; the test comments are transformed with the fitted vectorizer.
X_train_c = vectorizerc.fit_transform(X_train)
X_test_c = vectorizerc.transform(X_test)

In [92]:
# TF-IDF vectorizer: TF-IDF weights per comment, tokenized with preprocess()
vectorizert = TfidfVectorizer(tokenizer=preprocess)

# Fit on the training comments only; the test comments are transformed with the fitted vectorizer.
X_train_tf = vectorizert.fit_transform(X_train)
X_test_tf = vectorizert.transform(X_test)

### Lasso (L1-penalised) logistic regression on word counts

In [93]:
# L1-penalised logistic regression fitted on the word-count features
lr = LogisticRegression(penalty='l1', solver='saga', max_iter=4000, random_state=0)
lr.fit(X_train_c, y_train)

# Accuracy = share of predictions equal to the true label
train_preds = lr.predict(X_train_c)
test_preds = lr.predict(X_test_c)
print("training accuracy:", np.mean(train_preds == y_train))
print("testing accuracy:", np.mean(test_preds == y_test))

training accuracy: 0.8260138065008499
testing accuracy: 0.7912373816213966


In [94]:
# Prints the 50 highest and the 50 lowest weighted features with their weights.
# NOTE(review): this reads whichever model is currently bound to `lr`; run it directly
# after the cell whose coefficients are wanted.

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on versions that predate it.
if hasattr(vectorizerc, 'get_feature_names_out'):
    feature_names = vectorizerc.get_feature_names_out()
else:
    feature_names = vectorizerc.get_feature_names()

# Any whitespace inside a feature name is replaced by underscores
features = ['_'.join(s.split()) for s in feature_names]
coefs_df = pd.DataFrame.from_records(lr.coef_, columns=features)

coefs_by_feature = coefs_df.T  # one row per feature, column 0 holds the weight
print(coefs_by_feature.sort_values(by=[0], ascending=False).head(50), '\n',coefs_by_feature.sort_values(by=[0], ascending=True).head(50) )

coefs_df.to_csv("coef_final.csv") #Saves the coefficients into a csv file

                         0
backtests         5.118388
declared          3.449085
inferential       3.191841
remarkably        3.135088
eloquent          2.916783
redemption        2.870802
recommed          2.792675
nobel             2.791912
recycled          2.625961
researeasch       2.592687
testsassignments  2.555164
sucess            2.546583
pset              2.444369
australian        2.394399
superfluous       2.365215
9am               2.327386
neuroscience      2.322469
coursera          2.308388
persian           2.269973
mistaken          2.261013
xcredit           2.253713
catchup           2.244231
caltech           2.209440
mentorship        2.205322
legendary         2.183779
arrogent          2.177932
jam               2.175048
china             2.174193
ditsy             2.163920
900               2.159900
smoked            2.154484
relevance         2.146844
expo20            2.130280
gd                2.129583
quarter           2.127280
summarized        2.127201
o

### Lasso (L1-penalised) logistic regression on TF-IDF features

In [95]:
# classifier - L1-penalised logistic regression on the TF-IDF features.
# max_iter=4000 matches the two word-count models, so all three models get the same
# iteration budget instead of this one stopping at the solver's default limit.
lr = LogisticRegression(random_state=0, penalty = 'l1', solver = 'saga', max_iter=4000)

#training
lr.fit(X_train_tf,y_train)

#testing: accuracy = share of predictions equal to the true label
train_preds = lr.predict(X_train_tf)
test_preds = lr.predict(X_test_tf)
print("training accuracy:", np.mean(train_preds == y_train))
print("testing accuracy:", np.mean(test_preds == y_test))

training accuracy: 0.8087383355881639
testing accuracy: 0.7955042147986263


### Ridge (L2-penalised) logistic regression on word counts

In [96]:
# L2-penalised logistic regression fitted on the word-count features
lr = LogisticRegression(penalty='l2', solver='saga', max_iter=4000, random_state=0)
lr.fit(X_train_c, y_train)

# Accuracy = share of predictions equal to the true label
train_preds = lr.predict(X_train_c)
test_preds = lr.predict(X_test_c)
print("training accuracy:", np.mean(train_preds == y_train))
print("testing accuracy:", np.mean(test_preds == y_test))

training accuracy: 0.8507995975994728
testing accuracy: 0.7933707982100114


### Further statistics

In [97]:
# Add a column with the length of each comment, measured in characters
df_c['len_col'] = df_c['Comments'].str.len()

In [98]:
# Summary statistics per status group, printed in this order:
# std of Quality and Difficulty, mean of Difficulty and Quality, mean comment length.
by_status = df_c.groupby('status', as_index=False)
for column, statistic in [
    ('Quality', 'std'),
    ('Difficulty', 'std'),
    ('Difficulty', 'mean'),
    ('Quality', 'mean'),
    ('len_col', 'mean'),
]:
    print(by_status[column].agg(statistic))

   status   Quality
0  bottom  1.476532
1     top  1.486613
   status  Difficulty
0  bottom    1.246345
1     top    1.235679
   status  Difficulty
0  bottom    3.058642
1     top    3.084285
   status   Quality
0  bottom  3.649149
1     top  3.762763
   status     len_col
0  bottom  240.231233
1     top  189.453689
