# About
https://blog.csdn.net/carlwu/article/details/80017560

In [1]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
import math
import sklearn.metrics as sklm
from sklearn.model_selection import train_test_split

In [2]:
df_train = pd.read_csv('../data/train_dt.csv', index_col='row_id', encoding='utf8')
df_test = pd.read_csv('../data/test_x_dt.csv', index_col='row_id', encoding='utf8')

df_submission = pd.read_csv('../data/submission_format_klDJt5C.csv', index_col='row_id')

---  
## Split train to 80% for training and 20% for validation

In [3]:
train, test = train_test_split(df_train, test_size = 0.2)

print(train.shape)
print(test.shape)

(1120, 55)
(281, 55)


---
## Stepwise selected

In [4]:
def forward_selected(data, response):
    """
    前向逐步回归算法，源代码来自https://planspace.org/20150423-forward_selection_with_statsmodels/
    使用Adjusted R-squared来评判新加的参数是否提高回归中的统计显著性
    Linear model designed by forward selection.
    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response
    response: string, name of response column in data
    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared
    """
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = 0.0, 0.0
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {} + 1".format(response,
                                           ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop()
        if current_score < best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {} + 1".format(response,
                                   ' + '.join(selected))
    model = smf.ols(formula, data).fit()
 
    return model

In [5]:
# building model
model = forward_selected(train, 'prevalence_of_undernourishment')

---  
## Model info

In [6]:
# formula
print(model.model.formula)

prevalence_of_undernourishment ~ access_to_improved_water_sources + obesity_prevalence + caloric_energy_from_cereals_roots_tubers + avg_supply_of_protein_of_animal_origin + avg_value_of_food_production + net_oda_received_percent_gni + anemia_prevalence + cereal_import_dependency_ratio + total_population + trade_in_services + political_stability + access_to_electricity + fertility_rate + food_imports_as_share_of_merch_exports + life_expectancy + tax_revenue_share_gdp + year_2000 + percentage_of_arable_land_equipped_for_irrigation + total_land_area + gross_domestic_product_per_capita_ppp + year_2001 + open_defecation + total_labor_force + urban_population + per_capita_food_supply_variability + population_growth + year_2002 + year_2003 + net_oda_received_per_capita + year_2015 + year_2004 + year_2014 + school_enrollment_rate_total + 1


In [7]:
# params
print(model.params, '\n')
print(model.params.count())

Intercept                                            9.017487e+01
access_to_improved_water_sources                    -3.362405e-01
obesity_prevalence                                  -2.825832e-01
caloric_energy_from_cereals_roots_tubers            -4.218417e-01
avg_supply_of_protein_of_animal_origin              -2.655431e-01
avg_value_of_food_production                        -2.162782e-02
net_oda_received_percent_gni                        -1.875953e-02
anemia_prevalence                                   -1.063353e-01
cereal_import_dependency_ratio                      -3.870433e-02
total_population                                    -5.318556e-09
trade_in_services                                    5.630474e-02
political_stability                                 -1.725703e+00
access_to_electricity                               -1.440195e-01
fertility_rate                                      -2.220163e+00
food_imports_as_share_of_merch_exports               1.547511e-02
life_expec

In [8]:
# rsquared
print(model.rsquared_adj)

0.6952024622647044


---  
##  Predict Valuation Data

In [9]:
y_pred = model.predict(test)
y_pred.shape

(281,)

In [10]:
test_y = test['prevalence_of_undernourishment']
test_y.shape

(281,)

In [11]:
print('Root Mean Square Error = ' + str(math.sqrt(sklm.mean_squared_error(test_y, y_pred))))

Root Mean Square Error = 6.693316388652166


---  
##  Predict Test Data

In [12]:
# building model by whole train data
model = forward_selected(df_train, 'prevalence_of_undernourishment')

print(model.rsquared_adj)

0.6845598436059108


In [17]:
stepwise_res = model.predict(df_test)
stepwise_res.shape

(616,)

In [20]:
# export result to csv
df_submission['prevalence_of_undernourishment'] = pd.Series(stepwise_res)
df_submission.to_csv('stepwise_result.csv', index=True)

---  
## Summary