# Analyze country-year dataset

Jenna Jordan

## Resource

Introduction to Machine Learning with Python, by Andreas Müller and Sarah Guido - [link](https://learning.oreilly.com/library/view/introduction-to-machine/9781449369880/)

In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn
%matplotlib inline
import matplotlib.pyplot as plt
from scipy import sparse
from IPython.display import display

from sklearn.model_selection import train_test_split

In [2]:
# Load the country-year time-series CSV into the frame the following cells
# work from. The path is relative to the notebook's working directory.
df = pd.read_csv(
    "../Data/FINAL/final_time-series.csv",
)

In [3]:
# List every column label in the loaded frame.
df.columns

Index(['country', 'year', 'continent', 'region', 'iso3c', 'cow_id', 'gw_id',
       'p4_id', 'wb_id', 'ucdp_conflictCount', 'ucdp_intensity_War',
       'ucdp_intensity_Minor', 'ucdp_incompatibility_isTerritory',
       'ucdp_incompatibility_isGovernment', 'ucdp_type_Extrasystemic',
       'ucdp_type_Internal', 'ucdp_type_InternationalizedInternal',
       'ucdp_type_Interstate', 'ucdp_role_primary', 'ucdp_role_secondary',
       'cow_warCount', 'cow_IsInitiator', 'cow_TimeAtWar', 'cow_IsSameRegion',
       'cow_type_ExtraState', 'cow_type_InterState', 'cow_type_IntraState',
       'cow_type_NonState', 'cow_status_compromised',
       'cow_status_contConflict', 'cow_status_lost', 'cow_status_ongoing',
       'cow_status_stalemate', 'cow_status_transWarType', 'cow_status_won',
       'p4_fragment', 'p4_democ', 'p4_autoc', 'p4_polity', 'p4_polity2',
       'p4_durable', 'p4_xrreg', 'p4_xrcomp', 'p4_xropen', 'p4_xconst',
       'p4_parreg', 'p4_parcomp', 'p4_exrec', 'p4_exconst', 'p4_polc

In [4]:
# Preview the loaded frame; the notebook display elides middle rows and columns.
df

Unnamed: 0,country,year,continent,region,iso3c,cow_id,gw_id,p4_id,wb_id,ucdp_conflictCount,...,wb_SI.POV.GINI,wb_SL.UEM.TOTL.ZS,wb_SP.DYN.IMRT.IN,wb_SP.DYN.LE00.IN,wb_SP.DYN.TFRT.IN,wb_SP.POP.GROW,wb_SP.POP.TOTL,wb_SP.POP.TOTL.FE.IN,wb_SP.URB.TOTL.IN.ZS,wb_ST.INT.ARVL
0,Afghanistan,1946,Asia,Southern Asia,AFG,700.0,700.0,700.0,AFG,0.0,...,,,,,,,,,,
1,Afghanistan,1947,Asia,Southern Asia,AFG,700.0,700.0,700.0,AFG,0.0,...,,,,,,,,,,
2,Afghanistan,1948,Asia,Southern Asia,AFG,700.0,700.0,700.0,AFG,0.0,...,,,,,,,,,,
3,Afghanistan,1949,Asia,Southern Asia,AFG,700.0,700.0,700.0,AFG,0.0,...,,,,,,,,,,
4,Afghanistan,1950,Asia,Southern Asia,AFG,700.0,700.0,700.0,AFG,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11254,Zimbabwe,2014,Africa,Eastern Africa,ZWE,552.0,552.0,552.0,ZWE,0.0,...,,5.518,40.4,58.410,3.974,1.754692,13586681.0,7126780.0,32.504,1880000.0
11255,Zimbabwe,2015,Africa,Eastern Africa,ZWE,552.0,552.0,552.0,ZWE,0.0,...,,5.438,38.5,59.534,3.896,1.663813,13814629.0,7245857.0,32.385,2057000.0
11256,Zimbabwe,2016,Africa,Eastern Africa,ZWE,552.0,552.0,552.0,ZWE,0.0,...,,5.239,36.3,60.294,3.804,1.549759,14030390.0,7356159.0,32.296,2168000.0
11257,Zimbabwe,2017,Africa,Eastern Africa,ZWE,552.0,552.0,552.0,ZWE,0.0,...,,4.943,35.4,60.812,3.707,1.460061,14236745.0,7459621.0,32.237,2423000.0


In [5]:
# Binary war flag: 1 when ucdp_intensity_War or cow_warCount is positive, else 0.
# Missing values compare as False, so rows with no data in both columns get 0.
df['IsWar'] = (
    (df['ucdp_intensity_War'] > 0) | (df['cow_warCount'] > 0)
).astype('int64')

## Regression

In [6]:
# Working subset for the regression section: identifier columns, the polity
# score, the war flag, and the wb_-prefixed indicator columns.
econ_columns = [
    # identifiers
    'country', 'year',
    # polity score and war flag
    'p4_polity2', 'IsWar',
    # wb_ indicators
    'wb_BX.KLT.DINV.CD.WD',
    'wb_EG.ELC.ACCS.ZS',
    'wb_EN.ATM.CO2E.PC',
    'wb_EN.POP.DNST',
    'wb_FP.CPI.TOTL.ZG',
    'wb_MS.MIL.XPND.GD.ZS',
    'wb_NE.EXP.GNFS.ZS',
    'wb_NY.GDP.MKTP.CD',
    'wb_NY.GDP.MKTP.KD.ZG',
    'wb_NY.GDP.PCAP.CD',
    'wb_NY.GDP.PCAP.KD.ZG',
    'wb_NY.GDP.PCAP.PP.CD',
    'wb_SE.ADT.LITR.ZS',
    'wb_SE.XPD.TOTL.GB.ZS',
    'wb_SH.XPD.CHEX.GD.ZS',
    'wb_SI.POV.GINI',
    'wb_SL.UEM.TOTL.ZS',
    'wb_SP.DYN.IMRT.IN',
    'wb_SP.DYN.LE00.IN',
    'wb_SP.DYN.TFRT.IN',
    'wb_SP.POP.GROW',
    'wb_SP.POP.TOTL',
    'wb_SP.POP.TOTL.FE.IN',
    'wb_SP.URB.TOTL.IN.ZS',
    'wb_ST.INT.ARVL',
]
# .copy() detaches the subset so later edits do not touch df.
df_econ = df.loc[:, econ_columns].copy()

In [11]:
# Keep only the year-2000 rows and index them by (country, year), which
# leaves only the score, flag and indicator columns as regular columns.
df_econ2000 = (
    df_econ
    .loc[df_econ['year'] == 2000]
    .set_index(['country', 'year'])
)

Decade years (e.g. 1990, 2000, 2010) tend to have the fewest null values in the WDI variables, and CoW only tracks conflicts until 2008, so 2000 is a good year to select.

In [12]:
# Summary statistics with one row per column, so the non-null count of each
# column is easy to scan.
df_econ2000.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
p4_polity2,161.0,2.944099,6.566247,-10.0,-3.0,5.0,9.0,10.0
IsWar,191.0,0.1256545,0.3323306,0.0,0.0,0.0,0.0,1.0
wb_BX.KLT.DINV.CD.WD,180.0,7708850000.0,35104640000.0,-4550355000.0,25581750.0,154830200.0,995482200.0,350066000000.0
wb_EG.ELC.ACCS.ZS,157.0,73.40171,33.94459,2.844947,43.7,94.3933,100.0,100.0
wb_EN.ATM.CO2E.PC,186.0,4.478298,6.755429,0.01728181,0.4827556,1.98144,6.249414,58.61946
wb_EN.POP.DNST,189.0,241.1646,1246.912,1.543177,27.11216,68.86041,139.1277,16073.5
wb_FP.CPI.TOTL.ZG,155.0,13.60312,51.1194,-3.846154,1.628332,3.555413,8.10207,513.9068
wb_MS.MIL.XPND.GD.ZS,144.0,2.472064,3.110636,0.0,1.153464,1.728122,2.929364,32.65567
wb_NE.EXP.GNFS.ZS,167.0,38.98422,26.01131,0.5388379,21.58823,35.00888,49.56145,188.3509
wb_NY.GDP.MKTP.CD,184.0,178880400000.0,867837600000.0,13742060.0,1784657000.0,8945209000.0,56505060000.0,10252350000000.0


In [13]:
# Remove three indicator columns before modelling.
# NOTE(review): presumably dropped for sparse coverage in 2000 — confirm against
# the describe() counts.
df_econ2000 = df_econ2000.drop(
    labels=['wb_SE.ADT.LITR.ZS', 'wb_SI.POV.GINI', 'wb_SE.XPD.TOTL.GB.ZS'],
    axis='columns',
)

### Linear Regression

Let's try to predict a country's polity score from its economic indicators.

In [14]:
# Rows with no p4_polity2 value cannot serve as regression examples, so keep
# only the rows where the target is present.
df_econ2000 = df_econ2000.loc[df_econ2000['p4_polity2'].notna()]

Fill the remaining missing values with each column's mean.

In [15]:
# Replace each remaining NaN with the mean of its own column.
# NOTE(review): the means are taken over all rows, before the train/test split
# in a later cell, so held-out rows influence the imputed values. Consider
# imputing from the training rows only.
df_econ2000 = df_econ2000.fillna(df_econ2000.mean())

In [17]:
# Split into train/test sets with a fixed seed so the split is reproducible.
# Features and target are selected by column label rather than position, so the
# split stays correct if the column order of df_econ2000 changes:
#   - target: the polity score (p4_polity2)
#   - features: every remaining column except the target and the IsWar flag
X_train, X_test, y_train, y_test = train_test_split(
    df_econ2000.drop(columns=['p4_polity2', 'IsWar']),
    df_econ2000['p4_polity2'],
    random_state=0,
)

In [21]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split; fit() updates lr in place.
lr = LinearRegression()
lr.fit(X_train, y_train)

In [22]:
# Print the fitted model's coef_ and intercept_ attributes, one per line.
fitted_params = (
    ("lr.coef_:", lr.coef_),
    ("lr.intercept_:", lr.intercept_),
)
for label, value in fitted_params:
    print(label, value)

lr.coef_: [-2.53701070e-12 -4.29484619e-02 -1.28455703e-02 -4.01843234e-04
  2.23022670e-03 -3.32710241e-01 -1.70260260e-02 -5.32227196e-13
 -1.05376157e+00  3.80687560e-04  1.00043816e+00 -3.11573421e-04
  6.27891711e-01 -1.35630408e-01 -2.00940422e-02 -4.60868772e-02
 -1.23001065e+00  6.19528919e-01  2.70747266e-07 -5.64219227e-07
  7.51185292e-02  1.04334002e-07]
lr.intercept_: 11.094200315120927


In [23]:
# Score the linear model on both splits; a large gap between the two numbers
# means the fit does not generalise.
lr_train_score = lr.score(X_train, y_train)
lr_test_score = lr.score(X_test, y_test)
print("Training set score: {:.2f}".format(lr_train_score))
print("Test set score: {:.2f}".format(lr_test_score))

Training set score: 0.50
Test set score: 0.20


This model performs poorly: it overfits the training set (score 0.50) and has weak predictive power on the test set (score 0.20).

In [24]:
from sklearn.linear_model import Ridge

# Fit ridge regression with its default settings on the same split and report
# train/test scores for comparison with the plain linear model.
# NOTE(review): the features are passed in unscaled, and the describe() table
# shows columns spanning many orders of magnitude. The recorded output also
# ends in a warning fragment from linalg.solve. Standardizing the features
# first is probably needed for the penalty to have an effect — confirm.
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_train_score = ridge.score(X_train, y_train)
ridge_test_score = ridge.score(X_test, y_test)
print("Training set score: {:.2f}".format(ridge_train_score))
print("Test set score: {:.2f}".format(ridge_test_score))

Training set score: 0.50
Test set score: 0.20


  return linalg.solve(A, Xy, sym_pos=True,
