In [2]:
import pandas as pd
import numpy as np

### Data Wrangling

In this research, the three most essential data sets we are going to use are QS ranking (y), GDP per capita (x) and manufacturing export (IV) of countries. In this section, we will be doing data wrangling to each data set.

#### 1. ranking data (y)

step1: read and overview of dataset

In [3]:
ranking = pd.read_csv('qs-world-university-rankings-2017-to-2022-V2.csv')

In [4]:
ranking.head()

Unnamed: 0,university,year,rank_display,score,link,country,city,region,logo,type,research_output,student_faculty_ratio,international_students,size,faculty_count
0,Massachusetts Institute of Technology (MIT),2017,1,100.0,https://www.topuniversities.com/universities/m...,United States,Cambridge,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,4.0,3730,M,3065
1,Stanford University,2017,2,98.7,https://www.topuniversities.com/universities/s...,United States,Stanford,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,3.0,3879,L,4725
2,Harvard University,2017,3,98.3,https://www.topuniversities.com/universities/h...,United States,Cambridge,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,5.0,5877,L,4646
3,University of Cambridge,2017,4,97.2,https://www.topuniversities.com/universities/u...,United Kingdom,Cambridge,Europe,https://www.topuniversities.com/sites/default/...,Public,Very high,4.0,7925,L,5800
4,California Institute of Technology (Caltech),2017,5,96.9,https://www.topuniversities.com/universities/c...,United States,Pasadena,North America,https://www.topuniversities.com/sites/default/...,Private,Very High,2.0,692,S,968


In [5]:
ranking.dtypes

university                 object
year                        int64
rank_display               object
score                     float64
link                       object
country                    object
city                       object
region                     object
logo                       object
type                       object
research_output            object
student_faculty_ratio     float64
international_students     object
size                       object
faculty_count              object
dtype: object

step2: extract top 200 ranking universities of each year and add ranking index to the data of each year, then aggregate them by country

step3: combine ranking index of country of all years to a single dataframe

In [6]:
# Initialize a dictionary to hold the top 200 universities for each year
top_200_per_year = {}

# Initialize a list to hold the aggregated data frames for all years
aggregated_dataframes_all_years = []

for year_number in range(2017, 2023):  # Include 2022 as well
    # Filter the data for the specific year
    data_year = ranking[ranking['year'] == year_number]

    # Select relevant columns
    data_year = data_year[['university', 'year', 'rank_display', 'country', 'region']]

    # Convert rank_display to a numeric value, handling non-numeric ranks
    data_year['rank_display_numeric'] = pd.to_numeric(data_year['rank_display'], errors='coerce')

    # Sort the data by the numeric rank and filter the top 200 universities
    top_200 = data_year.sort_values('rank_display_numeric').head(200)

    # Generate the ranking index in reverse (from 200 to 1)
    top_200['ranking index'] = np.arange(200, 0, -1)[:len(top_200)]

    # Store in the top_200_per_year dictionary
    top_200_per_year[year_number] = top_200

    # Aggregate the ranking index by country and sum them up
    aggregated_data = top_200.groupby('country')['ranking index'].sum().reset_index()

    # Add a year column to the aggregated data
    aggregated_data['year'] = year_number

    # Rearrange columns to the order: country, year, ranking index
    aggregated_data = aggregated_data[['country', 'year', 'ranking index']]

    # Append to the list
    aggregated_dataframes_all_years.append(aggregated_data)

# Concatenate all the data frames in the list into a single data frame
combined_aggregated_data = pd.concat(aggregated_dataframes_all_years, ignore_index=True)

# Identify countries that appear in each year
countries_in_all_years = combined_aggregated_data['country'].value_counts()
countries_in_all_years = countries_in_all_years[countries_in_all_years == 6].index.tolist()

# Filter the combined_aggregated_data to include only these countries
filtered_aggregated_data = combined_aggregated_data[combined_aggregated_data['country'].isin(countries_in_all_years)]

# The filtered_aggregated_data now contains only the countries that appear in each year from 2017 to 2022

In [7]:
filtered_aggregated_data.head()

Unnamed: 0,country,year,ranking index
0,Argentina,2017,116
1,Australia,2017,1110
2,Austria,2017,63
3,Belgium,2017,257
4,Brazil,2017,90


In [7]:
countries_in_all_years

['Argentina',
 'Singapore',
 'Mexico',
 'Netherlands',
 'New Zealand',
 'Norway',
 'Russia',
 'Saudi Arabia',
 'South Korea',
 'Japan',
 'Spain',
 'Sweden',
 'Switzerland',
 'Taiwan',
 'United Kingdom',
 'United States',
 'Australia',
 'Malaysia',
 'Italy',
 'Israel',
 'Austria',
 'Belgium',
 'Brazil',
 'Canada',
 'Chile',
 'China (Mainland)',
 'Denmark',
 'Finland',
 'France',
 'Germany',
 'Hong Kong SAR',
 'India',
 'Ireland']

In [8]:
variable_data = pd.read_csv('variable_data.csv')

In [9]:
variable_data.head()

Unnamed: 0,Country,Code,ContinentCode,Year,Economic growth: the rate of change of real GDP,Gross Domestic Product billions of 2010 U.S. dollars,Unemployment rate,Exports of goods and services billion USD,Exports of goods and services annual growth,Imports of goods and services billion USD,Current account balance billion USD,Research and development expenditure percent of GDP,Public spending on education percent of GDP
0,Argentina,ARG,SA,2017,2.82,598.8,8.35,72.86,2.62,89.91,-31.15,0.56,5.45
1,Argentina,ARG,SA,2018,-2.62,583.1,9.22,75.77,0.65,85.68,-27.08,0.49,4.88
2,Argentina,ARG,SA,2019,-2.0,571.5,9.84,80.26,9.75,65.85,-3.49,0.46,4.72
3,Argentina,ARG,SA,2020,-9.94,514.6,11.46,64.04,-17.71,52.47,3.12,,5.02
4,Argentina,ARG,SA,2021,10.4,568.1,8.74,87.87,9.22,72.82,6.71,,


In [10]:
variable_data.rename(columns={'Country': 'country'}, inplace=True)
variable_data.rename(columns={'Year': 'year'}, inplace=True)

In [18]:
variable_data['country'] = variable_data['country'].replace({
    'China': 'China (Mainland)',
    'Hong Kong': 'Hong Kong SAR',
    'USA': 'United States'
})

In [19]:
# Assuming filtered_aggregated_data is already prepared as per previous steps



# Merge the two dataframes
merged_data = pd.merge(variable_data, filtered_aggregated_data, on=['country', 'year'], how='left')

# The merged_data dataframe will now contain all columns from variable_data plus the 'ranking index' from filtered_aggregated_data

In [20]:
merged_data.head()

Unnamed: 0,country,Code,ContinentCode,year,Economic growth: the rate of change of real GDP,Gross Domestic Product billions of 2010 U.S. dollars,Unemployment rate,Exports of goods and services billion USD,Exports of goods and services annual growth,Imports of goods and services billion USD,Current account balance billion USD,Research and development expenditure percent of GDP,Public spending on education percent of GDP,ranking index
0,Argentina,ARG,SA,2017,2.82,598.8,8.35,72.86,2.62,89.91,-31.15,0.56,5.45,116
1,Argentina,ARG,SA,2018,-2.62,583.1,9.22,75.77,0.65,85.68,-27.08,0.49,4.88,126
2,Argentina,ARG,SA,2019,-2.0,571.5,9.84,80.26,9.75,65.85,-3.49,0.46,4.72,128
3,Argentina,ARG,SA,2020,-9.94,514.6,11.46,64.04,-17.71,52.47,3.12,,5.02,128
4,Argentina,ARG,SA,2021,10.4,568.1,8.74,87.87,9.22,72.82,6.71,,,133


In [21]:
nan_exists = merged_data['ranking index'].isna().any()
print(f"Are there NaN values in 'ranking index'? {nan_exists}")

# Check for inf (infinite) values in 'ranking index'
inf_exists = merged_data['ranking index'].replace([np.inf, -np.inf], np.nan).isna().any()
print(f"Are there inf values in 'ranking index'? {inf_exists}")

Are there NaN values in 'ranking index'? False
Are there inf values in 'ranking index'? False


In [22]:
import pandas as pd
import numpy as np

# Assuming your merged_data DataFrame is already loaded
# merged_data = pd.read_csv('path_to_your_merged_data.csv')  # Load your merged data

# Identify the indices of NaN (Not a Number) values in 'ranking index'
nan_indices = merged_data['ranking index'][merged_data['ranking index'].isna()].index
print(f"Indices of NaN values in 'ranking index': {nan_indices.tolist()}")

# Identify the indices of inf (infinite) values in 'ranking index'
inf_indices = merged_data['ranking index'][merged_data['ranking index'].replace([np.inf, -np.inf], np.nan).isna()].index
print(f"Indices of inf values in 'ranking index': {inf_indices.tolist()}")

Indices of NaN values in 'ranking index': []
Indices of inf values in 'ranking index': []


In [23]:
pip install statsmodels

[0mNote: you may need to restart the kernel to use updated packages.


In [28]:
import pandas as pd
import numpy as np

# Assuming your merged_data DataFrame is already loaded
# merged_data = pd.read_csv('path_to_your_merged_data.csv')  # Load your merged data

# Specify the columns of interest
columns_of_interest = ['Economic growth: the rate of change of real GDP', 
                       'Gross Domestic Product billions of 2010 U.S. dollars', 
                       'Unemployment rate', 
                       'Imports of goods and services billion USD', 
                       'Current account balance billion USD']

# Iterate through each column and identify the indices of NaN and inf values
for column in columns_of_interest:
    # NaN values
    nan_indices = merged_data[column][merged_data[column].isna()].index
    print(f"Indices of NaN values in '{column}': {nan_indices.tolist()}")

    # Inf values
    inf_indices = merged_data[column][np.isinf(merged_data[column])].index
    print(f"Indices of inf values in '{column}': {inf_indices.tolist()}")


Indices of NaN values in 'Economic growth: the rate of change of real GDP': []
Indices of inf values in 'Economic growth: the rate of change of real GDP': []
Indices of NaN values in 'Gross Domestic Product billions of 2010 U.S. dollars': []
Indices of inf values in 'Gross Domestic Product billions of 2010 U.S. dollars': []
Indices of NaN values in 'Unemployment rate': []
Indices of inf values in 'Unemployment rate': []
Indices of NaN values in 'Imports of goods and services billion USD': []
Indices of inf values in 'Imports of goods and services billion USD': []
Indices of NaN values in 'Current account balance billion USD': []
Indices of inf values in 'Current account balance billion USD': []


In [27]:
# Fill in the NaN value at index 143 in the 'Unemployment rate' column with 4.3
merged_data.loc[143, 'Unemployment rate'] = 4.3

# This will replace the NaN value at index 143 in the 'Unemployment rate' column with 4.3


In [50]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

# Assuming your merged data is loaded into a DataFrame named 'merged_data'
# merged_data = pd.read_csv('path_to_your_merged_data.csv')  # Load your merged data

# Define the dependent variable
y = merged_data['ranking index']

# Define the independent variables and covariates
X = merged_data[['Economic growth: the rate of change of real GDP', 'Gross Domestic Product billions of 2010 U.S. dollars', 
                 'Unemployment rate', 'Imports of goods and services billion USD', 
                 'Current account balance billion USD']]

# Add a constant to the independent variables (if needed)
X = sm.add_constant(X)

# Define the instrumental variable
instrument = merged_data['Exports of goods and services billion USD']

# Perform the 2SLS regression
model = IV2SLS(y, X, instrument).fit()

# Print the summary of the regression results
print(model.summary())

                          IV2SLS Regression Results                           
Dep. Variable:          ranking index   R-squared:                   -7687.509
Model:                         IV2SLS   Adj. R-squared:              -7894.190
Method:                     Two Stage   F-statistic:                 4.089e-17
                        Least Squares   Prob (F-statistic):               1.00
Date:                Sun, 12 Nov 2023                                         
Time:                        08:56:22                                         
No. Observations:                 192                                         
Df Residuals:                     186                                         
Df Model:                           5                                         
                                                           coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------

  return np.sqrt(np.diag(self.cov_params()))


In [54]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.sandbox.regression.gmm import IV2SLS

# Assuming your merged data is loaded into a DataFrame named 'merged_data'
# merged_data = pd.read_csv('path_to_your_merged_data.csv')  # Load your merged data

# Define the dependent variable
y = merged_data['ranking index']

# Define the exogenous variables and covariates
X = merged_data[['Economic growth: the rate of change of real GDP', 
                 'Unemployment rate', 'Imports of goods and services billion USD', ]]

# Define the endogenous variable
endog = merged_data['Gross Domestic Product billions of 2010 U.S. dollars']
X = sm.add_constant(X.join(endog))  # Add a constant and include the endogenous variable

# Define the instrumental variable
instrument = merged_data['Exports of goods and services billion USD']

# Perform the 2SLS regression
model = IV2SLS(endog=y, exog=X, instrument=instrument).fit()

# Print the summary of the regression results
print(model.summary())

                          IV2SLS Regression Results                           
Dep. Variable:          ranking index   R-squared:                    -126.475
Model:                         IV2SLS   Adj. R-squared:               -129.201
Method:                     Two Stage   F-statistic:                 1.004e-15
                        Least Squares   Prob (F-statistic):               1.00
Date:                Sun, 12 Nov 2023                                         
Time:                        09:01:43                                         
No. Observations:                 192                                         
Df Residuals:                     187                                         
Df Model:                           4                                         
                                                           coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------

In [30]:
import pandas as pd
import statsmodels.api as sm

# Assuming your data is already loaded into a DataFrame named 'merged_data'
# merged_data = pd.read_csv('path_to_your_data.csv')  # Load your data

# Define the dependent variable (y) and independent variable (x)
y = merged_data['Economic growth: the rate of change of real GDP']
X = merged_data['Exports of goods and services billion USD']

# Add a constant to the independent variable
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())

                                           OLS Regression Results                                          
Dep. Variable:     Economic growth: the rate of change of real GDP   R-squared:                       0.012
Model:                                                         OLS   Adj. R-squared:                  0.007
Method:                                              Least Squares   F-statistic:                     2.321
Date:                                             Sun, 12 Nov 2023   Prob (F-statistic):              0.129
Time:                                                     08:36:09   Log-Likelihood:                -537.39
No. Observations:                                              192   AIC:                             1079.
Df Residuals:                                                  190   BIC:                             1085.
Df Model:                                                        1                                         
Covariance Type:            

In [31]:
y = merged_data['Economic growth: the rate of change of real GDP']
X = merged_data['Exports of goods and services annual growth']

# Add a constant to the independent variable
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())

                                           OLS Regression Results                                          
Dep. Variable:     Economic growth: the rate of change of real GDP   R-squared:                       0.607
Model:                                                         OLS   Adj. R-squared:                  0.605
Method:                                              Least Squares   F-statistic:                     293.3
Date:                                             Sun, 12 Nov 2023   Prob (F-statistic):           2.25e-40
Time:                                                     08:37:49   Log-Likelihood:                -448.93
No. Observations:                                              192   AIC:                             901.9
Df Residuals:                                                  190   BIC:                             908.4
Df Model:                                                        1                                         
Covariance Type:            

In [None]:
import pandas as pd
import statsmodels.api as sm

# Assuming your data is already loaded into a DataFrame named 'merged_data'
# merged_data = pd.read_csv('path_to_your_data.csv')  # Load your data

# Define the dependent variable (y)
y = merged_data['ranking index']

# Define the independent variables (X)
X = merged_data[['Economic growth: the rate of change of real GDP', 
                 'Exports of goods and services annual growth', 
                 'Gross Domestic Product billions of 2010 U.S. dollars'
                 'Unemployment rate']]

# Add a constant to the independent variables
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())

In [None]:
y = merged_data['Economic growth: the rate of change of real GDP']
X = merged_data['Exports of goods and services annual growth']

# Add a constant to the independent variable
X = sm.add_constant(X)

# Fit the linear regression model
model = sm.OLS(y, X).fit()

# Print the summary of the regression results
print(model.summary())