### Overview

In [1]:
import pandas as pd
import numpy as np

### Data Wrangling

In this research, the three most essential variables we are going to use are QS ranking (Y), GDP per capita (X) and manufacturing export (IV) of countries. In this section, we will be doing data wrangling to datasets to aquire the variables of interest.

#### step1: head the dataset to have an overview of it

In [2]:
ranking = pd.read_csv('qs-world-university-rankings-2017-to-2022-V2.csv')

In [3]:
ranking.head()

Unnamed: 0,university,year,rank_display,score,link,country,city,region,logo,type,research_output,student_faculty_ratio,international_students,size,faculty_count
0,Massachusetts Institute of Technology (MIT),2017,1,100.0,https://www.topuniversities.com/universities/m...,United States,Cambridge,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,4.0,3730,M,3065
1,Stanford University,2017,2,98.7,https://www.topuniversities.com/universities/s...,United States,Stanford,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,3.0,3879,L,4725
2,Harvard University,2017,3,98.3,https://www.topuniversities.com/universities/h...,United States,Cambridge,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,5.0,5877,L,4646
3,University of Cambridge,2017,4,97.2,https://www.topuniversities.com/universities/u...,United Kingdom,Cambridge,Europe,https://www.topuniversities.com/sites/default/...,Public,Very high,4.0,7925,L,5800
4,California Institute of Technology (Caltech),2017,5,96.9,https://www.topuniversities.com/universities/c...,United States,Pasadena,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,2.0,692,S,968


In [4]:
ranking.dtypes

university                 object
year                        int64
rank_display               object
score                     float64
link                       object
country                    object
city                       object
region                     object
logo                       object
type                       object
research_output            object
student_faculty_ratio     float64
international_students     object
size                       object
faculty_count              object
dtype: object

#### step2: extract top 200 ranking universities of each year and add ranking index to the universities in each year, then aggregate them by country

In [5]:
# create a dictionary to store the top 200 universities for each year
top_200_per_year = {}
# create a list to store the aggregated data frames for all years
aggregated_dataframes_all_years = []
for year_number in range(2017, 2023):  # Include 2022 as well
    # Using boolean indexing to select rows for each year
    data_year=ranking[ranking['year'] == year_number] 
    # Select columns of interest
    data_year = data_year[['university', 'year', 'rank_display', 'country', 'region']]
    # Convert rank_display to a numeric value
    data_year['rank_display'] = pd.to_numeric(data_year['rank_display'], errors='coerce')
    # Sort the data by the numeric rank and keep the top 200 universities
    top_200 = data_year.sort_values('rank_display').head(200)
    # Generate the ranking index in reverse to make sure higher ranking has bigger index
    top_200['ranking index'] = np.arange(200, 0, -1)[:len(top_200)]
    # Store in the top_200_per_year dictionary
    top_200_per_year[year_number] = top_200
    # Sort the ranking index by country and sum them up
    aggregated_data = top_200.groupby('country')['ranking index'].sum().reset_index()
    # Add a year column to the aggregated data
    aggregated_data['year'] = year_number
    # Rearrange columns in the order: country, year, ranking index
    aggregated_data = aggregated_data[['country', 'year', 'ranking index']]
    # Append to the list
    aggregated_dataframes_all_years.append(aggregated_data)

#### step3: combine the data and select countries that appear in all years

In [6]:
# Concatenate all the data frames in the list into a single data frame
combined_aggregated_data = pd.concat(aggregated_dataframes_all_years, ignore_index=True)
# Keep the countries that appear in all six years
countries_in_all_years = combined_aggregated_data['country'].value_counts()
countries_in_all_years = countries_in_all_years[countries_in_all_years == 6].index.tolist()
# Filter the combined_aggregated_data to include only these countries
filtered_aggregated_data = combined_aggregated_data[combined_aggregated_data['country'].isin(countries_in_all_years)]
# The filtered_aggregated_data now contains only the countries that appear in all 6 years (from 2017 to 2022)

In [8]:
filtered_aggregated_data.head()

Unnamed: 0,country,year,ranking index
0,Argentina,2017,116
1,Australia,2017,1110
2,Austria,2017,63
3,Belgium,2017,257
4,Brazil,2017,90


Add covariates from another 2 datasets

In [9]:
variable_data = pd.read_csv('variable_data_copy.csv')

In [10]:
variable_data.head()

Unnamed: 0,Country,Code,ContinentCode,Year,Economic growth: the rate of change of real GDP,Gross Domestic Product billions of 2010 U.S. dollars,Unemployment rate,Exports of goods and services billion USD,Exports of goods and services annual growth,Current account balance billion USD
0,Argentina,ARG,SA,2017,2.82,598.8,8.35,72.86,2.62,-31.15
1,Argentina,ARG,SA,2018,-2.62,583.1,9.22,75.77,0.65,-27.08
2,Argentina,ARG,SA,2019,-2.0,571.5,9.84,80.26,9.75,-3.49
3,Argentina,ARG,SA,2020,-9.94,514.6,11.46,64.04,-17.71,3.12
4,Argentina,ARG,SA,2021,10.4,568.1,8.74,87.87,9.22,6.71


In [11]:
new_variable = pd.read_csv('new_variable_copy.csv')
new_variable.head()

Unnamed: 0,Country,Code,ContinentCode,Year,GDP per capita constant 2010 dollars,Capital investment as percent of GDP,Capital investment billion USD,Household consumption as percent of GDP,Household consumption billion USD,Labor force million people,Government spending as percent of GDP,Government spending billion USD,Population growth percent,Happiness Index 0 (unhappy) - 10 (happy)
0,Argentina,ARG,SA,2017,13595.04,18.21,117.22,66.74,429.55,19.58,17.7,113.9,1.04,6.6
1,Argentina,ARG,SA,2018,13105.4,16.61,87.19,69.47,364.59,20.1,15.81,82.95,1.02,6.39
2,Argentina,ARG,SA,2019,12716.22,14.21,63.63,66.13,296.09,20.61,16.44,73.63,0.99,6.09
3,Argentina,ARG,SA,2020,11341.27,14.13,54.48,63.79,245.94,19.41,16.89,65.12,0.97,5.93
4,Argentina,ARG,SA,2021,12402.49,17.47,85.14,60.89,296.69,21.19,15.84,77.19,0.95,5.97


In [25]:
#make sure the column names align with the ones in filtered_aggregated_data
variable_data.rename(columns={'Country': 'country'}, inplace=True)
variable_data.rename(columns={'Year': 'year'}, inplace=True)
new_variable.rename(columns={'Country': 'country'}, inplace=True)
new_variable.rename(columns={'Year': 'year'}, inplace=True)

In [14]:
variable_data['country'] = variable_data['country'].replace({
    'China': 'China (Mainland)',
    'Hong Kong': 'Hong Kong SAR',
    'USA': 'United States'
})

In [15]:
new_variable['country'] = new_variable['country'].replace({
    'China': 'China (Mainland)',
    'Hong Kong': 'Hong Kong SAR',
    'USA': 'United States'
})

In [26]:
# Merge the two dataframes
merged_data = pd.merge(variable_data, filtered_aggregated_data, on=['country', 'year'], how='left')
merged_data = pd.merge(new_variable, merged_data, on=['country', 'year',"Code","ContinentCode"], how='left')

### merged_data dataframe will now contain all columns of variable_data and new_variable, plus 'ranking index' from filtered_aggregated_data for each country and each year

In [27]:
merged_data.head()

Unnamed: 0,country,Code,ContinentCode,year,GDP per capita constant 2010 dollars,Capital investment as percent of GDP,Capital investment billion USD,Household consumption as percent of GDP,Household consumption billion USD,Labor force million people,...,Government spending billion USD,Population growth percent,Happiness Index 0 (unhappy) - 10 (happy),Economic growth: the rate of change of real GDP,Gross Domestic Product billions of 2010 U.S. dollars,Unemployment rate,Exports of goods and services billion USD,Exports of goods and services annual growth,Current account balance billion USD,ranking index
0,Argentina,ARG,SA,2017,13595.04,18.21,117.22,66.74,429.55,19.58,...,113.9,1.04,6.6,2.82,598.8,8.35,72.86,2.62,-31.15,116
1,Argentina,ARG,SA,2018,13105.4,16.61,87.19,69.47,364.59,20.1,...,82.95,1.02,6.39,-2.62,583.1,9.22,75.77,0.65,-27.08,126
2,Argentina,ARG,SA,2019,12716.22,14.21,63.63,66.13,296.09,20.61,...,73.63,0.99,6.09,-2.0,571.5,9.84,80.26,9.75,-3.49,128
3,Argentina,ARG,SA,2020,11341.27,14.13,54.48,63.79,245.94,19.41,...,65.12,0.97,5.93,-9.94,514.6,11.46,64.04,-17.71,3.12,128
4,Argentina,ARG,SA,2021,12402.49,17.47,85.14,60.89,296.69,21.19,...,77.19,0.95,5.97,10.4,568.1,8.74,87.87,9.22,6.71,133


In [28]:
merged_data.to_csv('merged_data.csv', index=False)

### Regression models

In [29]:
# Check for na values in 'ranking index'
nan_exists = merged_data['ranking index'].isna().any()
print(f"Are there NaN values in 'ranking index'? {nan_exists}")
# Check for inf (infinite) values in 'ranking index'
inf_exists = merged_data['ranking index'].replace([np.inf, -np.inf], np.nan).isna().any()
print(f"Are there inf values in 'ranking index'? {inf_exists}")

Are there NaN values in 'ranking index'? False
Are there inf values in 'ranking index'? False


In [22]:
pip install statsmodels

[0mNote: you may need to restart the kernel to use updated packages.


In [23]:
# Specify the columns of interest
columns_of_interest = ['Economic growth: the rate of change of real GDP', 
                       'Gross Domestic Product billions of 2010 U.S. dollars', 
                       'Unemployment rate', 
                       'Imports of goods and services billion USD', 
                       'Current account balance billion USD']

# Iterate through each column and identify the indices of NaN and inf values
for column in columns_of_interest:
    # NaN values
    nan_indices = merged_data[column][merged_data[column].isna()].index
    print(f"Indices of NaN values in '{column}': {nan_indices.tolist()}")

    # Inf values
    inf_indices = merged_data[column][np.isinf(merged_data[column])].index
    print(f"Indices of inf values in '{column}': {inf_indices.tolist()}")


Indices of NaN values in 'Economic growth: the rate of change of real GDP': []
Indices of inf values in 'Economic growth: the rate of change of real GDP': []
Indices of NaN values in 'Gross Domestic Product billions of 2010 U.S. dollars': []
Indices of inf values in 'Gross Domestic Product billions of 2010 U.S. dollars': []
Indices of NaN values in 'Unemployment rate': [143]
Indices of inf values in 'Unemployment rate': []


KeyError: 'Imports of goods and services billion USD'

In [None]:
# Fill in the NaN value at index 143 in the 'Unemployment rate' column with 4.3
merged_data.loc[143, 'Unemployment rate'] = 4.3

# This will replace the NaN value at index 143 in the 'Unemployment rate' column with 4.3


In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

# Assuming your merged data is loaded into a DataFrame named 'merged_data'
# merged_data = pd.read_csv('path_to_your_merged_data.csv')  # Load your merged data

# Define the dependent variable
y = merged_data['ranking index']

# Define the independent variables and covariates
X = merged_data[['Economic growth: the rate of change of real GDP', 'Gross Domestic Product billions of 2010 U.S. dollars', 
                 'Unemployment rate', 'Imports of goods and services billion USD', 
                 'Current account balance billion USD']]

# Add a constant to the independent variables (if needed)
X = sm.add_constant(X)

# Define the instrumental variable
instrument = merged_data['Exports of goods and services billion USD']

# Perform the 2SLS regression
model = IV2SLS(y, X, instrument).fit()

# Print the summary of the regression results
print(model.summary())

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

# Assuming your merged data is loaded into a DataFrame named 'merged_data'
# merged_data = pd.read_csv('path_to_your_merged_data.csv')  # Load your merged data

# Define the dependent variable
y = merged_data['ranking index']

# Define the exogenous variables and covariates
X = merged_data[['Economic growth: the rate of change of real GDP', 
                 'Unemployment rate', 'Imports of goods and services billion USD', ]]

# Define the endogenous variable
endog = merged_data['Gross Domestic Product billions of 2010 U.S. dollars']
X = sm.add_constant(X.join(endog))  # Add a constant and include the endogenous variable

# Define the instrumental variable
instrument = merged_data['Exports of goods and services billion USD']

# Perform the 2SLS regression
model = IV2SLS(endog=y, exog=X, instrument=instrument).fit()

# Print the summary of the regression results
print(model.summary())

In [None]:
import pandas as pd
import statsmodels.api as sm

# Assuming your data is already loaded into a DataFrame named 'merged_data'
# merged_data = pd.read_csv('path_to_your_data.csv')  # Load your data

# Define the dependent variable (y) and independent variable (x)
y = merged_data['Economic growth: the rate of change of real GDP']
X = merged_data['Exports of goods and services billion USD']

# Add a constant to the independent variable
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())

In [None]:
y = merged_data['Economic growth: the rate of change of real GDP']
X = merged_data['Exports of goods and services annual growth']

# Add a constant to the independent variable
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())

In [None]:
import pandas as pd
import statsmodels.api as sm

# Assuming your data is already loaded into a DataFrame named 'merged_data'
# merged_data = pd.read_csv('path_to_your_data.csv')  # Load your data

# Define the dependent variable (y)
y = merged_data['ranking index']

# Define the independent variables (X)
X = merged_data[['Economic growth: the rate of change of real GDP', 
                 'Exports of goods and services annual growth', 
                 'Gross Domestic Product billions of 2010 U.S. dollars'
                 'Unemployment rate']]

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())

In [None]:
y = merged_data['Economic growth: the rate of change of real GDP']
X = merged_data['Exports of goods and services annual growth']

# Add a constant to the independent variable
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())