In [2]:
import pandas as pd
import json
import requests
import time
import numpy as np
import re

# Data Pull/Parse

In [12]:
def get_data(school_id):
    id = school_id
    url_ex = f'http://www.ratemyprofessors.com/filter/professor/?&page=1&filter=teacherlastname_sort_s+asc&query=*%3A*&queryoption=TEACHER&queryBy=schoolId&sid={id}'
    response = requests.get(url_ex)
    data = json.loads(response.text)
    num_of_professors = data['searchResultsTotal']
    num_divided = num_of_professors / 20
    
    if isinstance(num_divided, float) == True:
        num_divided = int(num_divided + 1)
        
    professors = []    
    for i in range(1, num_divided):
        id = school_id
        url = f'http://www.ratemyprofessors.com/filter/professor/?&page={i}&filter=teacherlastname_sort_s+asc&query=*%3A*&queryoption=TEACHER&queryBy=schoolId&sid={id}'
        response = requests.get(url)
        data = json.loads(response.text)
        professors.append(data)
    new_list = []
    for i in range(len(professors)):
        prof_data = professors[i]['professors']
        new_list.append(prof_data)
    prof = []
    for i in range(len(new_list)):
        a = new_list[i]
        for j in range(len(a)):
            b = a[j]
            prof.append(b)
    
    prof_list = []
    
    for i in range(len(prof)):
            overall_rating = prof[i]['overall_rating']
            if overall_rating == 'N/A':
                continue
            else:
                school = prof[i]['institution_name']
                dept = prof[i]['tDept']
                name = prof[i]['tLname'] + ' ' + prof[i]['tFname']
                num_ratings = prof[i]['tNumRatings']
                rating = prof[i]['overall_rating']
                prof_tuple = (school, name, dept, float(rating), num_ratings)
                prof_list.append(prof_tuple)
    return prof_list

In [13]:
prof_list = get_data(758)

In [14]:
prof_list

[('Pennsylvania State University', 'Aalberts Robert', 'Law', 3.5, 15),
 ('Pennsylvania State University', 'Aas Erik', 'Mathematics', 2.3, 3),
 ('Pennsylvania State University', 'Abaci Uygar', 'Philosophy', 4.3, 11),
 ('Pennsylvania State University', 'Abalo Kodzovi', 'Economics', 2.8, 6),
 ('Pennsylvania State University',
  'Abar Caitlin',
  'Human Development',
  4.7,
  3),
 ('Pennsylvania State University',
  'Abbaas Omar',
  'Industrial Engineering',
  3.6,
  12),
 ('Pennsylvania State University', 'Abdullah Amir', 'Theater', 4.0, 1),
 ('Pennsylvania State University', 'Abdullah Ayesha', 'Philosophy', 3.6, 6),
 ('Pennsylvania State University', 'Abdullah Gary', 'Communication', 5.0, 2),
 ('Pennsylvania State University', 'Abel Jessamyn', 'History', 4.0, 9),
 ('Pennsylvania State University', 'Abel Jonathan', 'Literature', 3.8, 17),
 ('Pennsylvania State University', 'Abelbeck Hannah', 'English', 2.5, 7),
 ('Pennsylvania State University', 'Abler David', 'Economics', 2.5, 9),
 ('Pen

# DateFrame Creation

In [16]:
df=pd.DataFrame(prof_list)

In [27]:
df=df.rename(columns={0: "School", 1: "Professor", 2: "Department", 3:"Rating", 4:"Rating Count"})

In [132]:
df.to_csv('RateMyProfessor_Data.csv')

In [31]:
pd.set_option('display.max_rows', 1000)

In [133]:
df

Unnamed: 0,School,Professor,Department,Rating,Rating Count
0,Pennsylvania State University,Aalberts Robert,Law,3.5,15
1,Pennsylvania State University,Aas Erik,Mathematics,2.3,3
2,Pennsylvania State University,Abaci Uygar,Philosophy,4.3,11
3,Pennsylvania State University,Abalo Kodzovi,Economics,2.8,6
4,Pennsylvania State University,Abar Caitlin,Human Development,4.7,3
...,...,...,...,...,...
5020,Pennsylvania State University,Zhu Nan,Actuarial Science,3.5,4
5021,Pennsylvania State University,Zhu Sencun,Computer Science,2.3,16
5022,Pennsylvania State University,Ziegler Greg,Food Science,2.7,3
5023,Pennsylvania State University,Zietsma Charlene,Business,2.0,1


In [46]:
df_ratings = pd.DataFrame(df.groupby(['Department'])['Rating'].mean())

In [44]:
df_rating_counts = pd.DataFrame(df.groupby(['Department'])['Rating Count'].sum())

In [50]:
df_rating_counts

Unnamed: 0_level_0,Rating Count
Department,Unnamed: 1_level_1
Accounting,1034
Actuarial Science,19
Aerospace Engineering,127
African-American Studies,182
Africana Studies,30
Agriculture,331
Animal Science,156
Anthropology,578
Arabic,22
Architectural Engineering,2


In [51]:
df_total = pd.merge(df_ratings, df_rating_counts, on='Department')

# Cleaning/Export to CSV

In [77]:
df_total.shape

(117, 2)

In [54]:
updates = [{'Department':'Classical & Medieval Studies', 'Rating': 4.4821471041756356, 'Rating Count': 66},
           {'Department': 'African Studies', 'Rating': 3.8477987579579627, 'Rating Count': 212},
           {'Department':'Biology', 'Rating': 3.5424170101403556, 'Rating Count': 2682},
           {'Department':'Curriculum Instruction', 'Rating': 4.116825408405727, 'Rating Count': 30},
           {'Department':'Engineering Mechanics', 'Rating': 3.5782609236413156, 'Rating Count': 46},
           {'Department':'Hotel & Restaurant Management', 'Rating': 3.841666678003713, 'Rating Count': 245},
           {'Department':'Information Science Tech', 'Rating': 3.4685114030796003, 'Rating Count': 1362},
           {'Department':'Labor Employment Relations', 'Rating': 3.9464963271008227, 'Rating Count': 246},
           {'Department':'Landscape Architecture & Regional Planning', 'Rating': 3.57796050333663, 'Rating Count': 95},
           {'Department':'Recreation, Parks, & Tourism', 'Rating': 3.9977112855862407, 'Rating Count': 142}
          ]

In [58]:
df_updates = pd.DataFrame(updates)

In [83]:
df_updates = df_updates.set_index('Department')

In [78]:
df_total = df_total.drop(['Classical  Medieval Studies','Biological Sciences', 'Biology','Classical & Medieval Studies','Classical amp Medieval Studies', 'Curriculum  Instruction', 'Curriculum amp Instruction', 'Ecosystem Science  Management', 'Ecosystem Science & Management', 'Engineering Science  Mechanics', 'Engineering Science amp Mechanics', 'Engineering, Science, & Mechanics', 'Hotel  Restaurant Management', 'Hotel & Restaurant Management', 'Hotel amp Restaurant Management', 'Information Sciences  Technology', 'Information Sciences & Technology', 'Information Science', 'Information Sciences amp Technology', 'Labor  Employment Relations', 'Labor & Employment Relations','Labor amp Employment Relations', 'Landscape Architecture  Regional Planning', 'Landscape Architecture & Regional Planning', 'Recreation Parks  Tourism', 'Recreation Parks amp Tourism', 'Recreation, Parks, & Tourism', 'African-American Studies', 'Africana Studies'])


KeyError: "['Classical  Medieval Studies' 'Biological Sciences' 'Biology'\n 'Classical & Medieval Studies' 'Classical amp Medieval Studies'\n 'Curriculum  Instruction' 'Curriculum amp Instruction'\n 'Ecosystem Science  Management' 'Ecosystem Science & Management'\n 'Engineering Science  Mechanics' 'Engineering Science amp Mechanics'\n 'Engineering, Science, & Mechanics' 'Hotel  Restaurant Management'\n 'Hotel & Restaurant Management' 'Hotel amp Restaurant Management'\n 'Information Sciences  Technology' 'Information Sciences & Technology'\n 'Information Science' 'Information Sciences amp Technology'\n 'Labor  Employment Relations' 'Labor & Employment Relations'\n 'Labor amp Employment Relations'\n 'Landscape Architecture  Regional Planning'\n 'Landscape Architecture & Regional Planning' 'Recreation Parks  Tourism'\n 'Recreation Parks amp Tourism' 'Recreation, Parks, & Tourism'\n 'African-American Studies' 'Africana Studies'] not found in axis"

In [76]:
df_total.shape

(117, 2)

In [91]:
df_updates.shape

(10, 2)

In [87]:
frames1=[df_total,df_updates]

In [88]:
df_total_updated = pd.concat(frames1)

In [92]:
df_total_updated

Unnamed: 0_level_0,Rating,Rating Count
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Accounting,3.621053,1034
Actuarial Science,3.6,19
Aerospace Engineering,3.463636,127
Agriculture,3.909804,331
Animal Science,3.561111,156
Anthropology,3.419565,578
Arabic,4.5,22
Architectural Engineering,4.0,2
Architecture,3.577778,265
Art,3.608511,305


In [110]:
df_total_updated = df_total_updated.reset_index()

In [115]:
df_total_updated = df_total_updated.drop(columns='numbers')

In [130]:
df_total_updated

Unnamed: 0,Department,Rating,Rating Count
0,Accounting,3.621053,1034
1,Actuarial Science,3.6,19
2,Aerospace Engineering,3.463636,127
3,Agriculture,3.909804,331
4,Animal Science,3.561111,156
5,Anthropology,3.419565,578
6,Arabic,4.5,22
7,Architectural Engineering,4.0,2
8,Architecture,3.577778,265
9,Art,3.608511,305


In [118]:
df_total_updated.to_csv('Department_Data.csv')

# Penn State Global Information

In [127]:
df_total_updated['Rating Count'].sum()

65261

In [128]:
df_total_updated['Rating'].mean()

3.7564487833401707

In [129]:
penn_state_global =(3.756449,65261)

In [None]:
##################