In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np
import scipy

## メダル獲得国の抽出

In [None]:
# Collect the name of every country/region that appears in any per-year
# medal table.
df_host = pd.read_csv("data/host_country.csv")
# Every year listed in the host table except its last row.
years = df_host["年"][:-1]

countries = []
for year in years:
    points_path = "data/points/{}_point.csv".format(year)
    try:
        df_1 = pd.read_csv(points_path)
    except UnicodeDecodeError:
        # The default decoding failed, so retry the table as Shift-JIS.
        # Any other failure (for example a missing file) is raised
        # unchanged instead of being hidden behind a second read.
        df_1 = pd.read_csv(points_path, encoding="shift-jis")
    countries.extend(df_1["国・地域"].values)

# De-duplicate in first-seen order. set() ordering of strings changes between
# kernel starts, which would reshuffle every table derived from `countries`.
countries = list(dict.fromkeys(countries))

## 緯度経度

In [None]:
# Find the coordinates of each medal-winning country via its Japanese name.
df = pd.read_csv("data/latilong.csv")
col = ["namejps", "namens", "lat", "lon"]

cs = []
for c in countries:
    matches = df.loc[df["namejps"] == c, col]
    # Only unambiguous names are kept: a name with no matching row, or with
    # more than one, is skipped.
    if len(matches) != 1:
        continue
    cs.append(matches.values[0])

df_main = pd.DataFrame(cs, columns=col)



## データの結合

In [None]:
# Load the per-country source tables.
def _to_int(text):
    """Parse a formatted count: every "..." becomes "0", commas are removed."""
    return int(text.replace("...", "0").replace(",", ""))


df_gdp = pd.read_csv("data/gdp.csv")
df_population = pd.read_csv("data/population.csv")
for table in (df_gdp, df_population):
    # Every column after the first one is converted from text to int.
    for k in table.keys()[1:]:
        table[k] = table[k].apply(_to_int)

# Only the name and the annual figure are needed from the temperature table.
df_temp = pd.read_csv("data/temperature.csv")[["name", "Annual"]]

# Trim GDP and population to the country name plus one "<year>年" column per
# analysed year.
columns = ["国名"] + [str(y) + "年" for y in years]
df_gdp = df_gdp[columns]
df_population = df_population[columns]
# Build one row per country: name, GDP per year, population per year, annual
# temperature, Japanese name, latitude and longitude.
data = []
for country in df_main.namens.values:
    latilong = df[df.namens == country][["namejps", "lat", "lon"]]
    gdps = df_gdp[df_gdp["国名"] == country]
    pops = df_population[df_population["国名"] == country]
    # The temperature table is matched on the upper-cased country name.
    temperature = df_temp[df_temp["name"] == country.upper()]

    # A row needs all four sources. Taking .values[0] from an empty selection
    # raises IndexError, so a country missing from any table is skipped.
    if 0 in (len(gdps), len(pops), len(temperature), len(latilong)):
        continue

    res = []
    res.extend(gdps.values[0])             # country name + GDP per year
    res.extend(pops.values[0][1:])         # population per year (name dropped)
    res.extend(temperature.values[0][1:])  # annual temperature
    res.extend(latilong.values[0])         # Japanese name, lat, lon
    data.append(res)

gdp_cols = [str(y) + "年gdp" for y in years]
pop_cols = [str(y) + "年pops" for y in years]
col = ["国名"] + gdp_cols + pop_cols + ["temperature", "国名(日本)", "lati", "long"]
df_gdpop = pd.DataFrame(data, columns=col)

# Store GDP and population as log(1 + x); the +1 keeps zero entries finite.
df_gdpop[gdp_cols] = np.log(df_gdpop[gdp_cols] + 1)
df_gdpop[pop_cols] = np.log(df_gdpop[pop_cols] + 1)

In [None]:
# Add one "<year>point" column per year holding each country's medal score
# (0 when the country is absent from that year's table).
df_host = pd.read_csv("data/host_country.csv")
medal_cs = []
for year in years[:-1]:
    points_path = "data/points/{}_point.csv".format(year)
    try:
        df_1 = pd.read_csv(points_path)
    except UnicodeDecodeError:
        # The default decoding failed, so retry the table as Shift-JIS.
        # Other errors (for example a missing file) are raised unchanged.
        df_1 = pd.read_csv(points_path, encoding="shift-jis")

    if "point" not in df_1.columns:
        # The table carries no score column of its own, so derive it from
        # the medal counts: gold x5, silver x3, bronze x1.
        df_1["point"] = df_1["金"] * 5 + df_1["銀"] * 3 + df_1["銅"]

    # Names listed in the most recently loaded table.
    medal_cs = df_1["国・地域"].values

    # One pass over the table builds a name -> score lookup; setdefault keeps
    # the first row when a name is listed more than once.
    first_points = {}
    for name, score in zip(df_1["国・地域"].values, df_1["point"].values):
        first_points.setdefault(name, score)

    points = [first_points.get(con, 0) for con in df_gdpop["国名(日本)"].values]
    df_gdpop["{}point".format(year)] = points
df_gdpop.head()

## 二地点の距離
https://qiita.com/s-wakaba/items/e12f2a575b6885579df7

In [None]:
from math import sin, cos, acos, radians
# Equatorial radius of the Earth in kilometres.
earth_rad = 6378.137


def latlng_to_xyz(lat, lng):
    """Convert a latitude/longitude pair in degrees to a unit-sphere point.

    Returns:
        An (x, y, z) tuple of floats with x**2 + y**2 + z**2 == 1 up to
        floating-point rounding.
    """
    rlat, rlng = radians(lat), radians(lng)
    coslat = cos(rlat)
    return coslat*cos(rlng), coslat*sin(rlng), sin(rlat)


def dist_on_earth(pos0, pos1, radious=earth_rad):
    """Great-circle distance between two (lat, lng) positions in degrees.

    Args:
        pos0: Sequence whose first two items are latitude and longitude.
        pos1: Sequence whose first two items are latitude and longitude.
        radious: Sphere radius; the result is in the same unit. Defaults to
            `earth_rad`.

    Returns:
        The central angle between the two points multiplied by `radious`.
    """
    xyz0 = latlng_to_xyz(pos0[0], pos0[1])
    xyz1 = latlng_to_xyz(pos1[0], pos1[1])
    dot = sum(x * y for x, y in zip(xyz0, xyz1))
    # Rounding can leave the dot product of two unit vectors marginally
    # outside [-1, 1] (identical or antipodal points), and acos() raises
    # ValueError there. Clamp it back into the valid domain first.
    dot = max(-1.0, min(1.0, dot))
    return acos(dot)*radious

# For every year, add a "<year>年dist" column: the distance from that year's
# host location to each country's coordinates.
pos1s = df_gdpop[["lati", "long"]].values
for y in years:
    # Latitude/longitude of the first host row recorded for this year.
    pos0 = df_host[df_host.年 == y][["緯度", "経度"]].values[0]
    dists = [dist_on_earth(pos0, pos1) for pos1 in pos1s]
    df_gdpop["{}年dist".format(y)] = dists

In [None]:
# Write the combined table to disk, then preview its first rows.
main_csv_path = "data/main.csv"
df_gdpop.to_csv(main_csv_path)
df_gdpop.head()

## 正規化定数

In [None]:
# Constants used to standardise the model features.
year_cols = [str(y) + "年" for y in years[:-1]]

# GDP statistics on the log(1 + x) scale, per year column.
log_gdp = np.log(df_gdp[year_cols] + 1)
max_gdp, min_gdp = log_gdp.max(), log_gdp.min()
mean_gdp, std_gdp = log_gdp.mean(), log_gdp.std()

# Population statistics on the same log(1 + x) scale as the feature columns.
# The +1 is required: entries parsed from "..." are 0, and log(0) = -inf
# would turn the mean into -inf and the standard deviation into NaN.
log_pop = np.log(df_population[year_cols] + 1)
mean_pop, std_pop = log_pop.mean(), log_pop.std()

# Temperature statistics over the whole temperature table.
max_temp, min_temp = df_temp["Annual"].max(), df_temp["Annual"].min()
mean_temp, std_temp = df_temp["Annual"].mean(), df_temp["Annual"].std()

# Disabled alternative: there are very many temperatures of 70F or above, so
# the statistics could be taken over the cooler entries only.
# cooler = df_temp[df_temp.Annual < 70].Annual
# mean_temp, std_temp = cooler.mean(), cooler.std()

In [None]:
# Build one standardised block of rows per year and stack them.
out_cols = ["country", "gdp", "pops", "temp", "dist", "point"]

# Columns are read straight from df_gdpop, so the earlier `df_main`
# (coordinates) and `columns` variables stay intact and the cells that use
# them can be re-run after this one.
yearly_frames = []
for year in years[:-1]:
    tag = str(year)
    stat_key = tag + "年"  # key into the per-year GDP/population statistics
    dist = df_gdpop[tag + "年dist"]

    yearly_frames.append(pd.DataFrame({
        "country": df_gdpop["国名"],
        # GDP and population use the precomputed per-year statistics.
        "gdp": (df_gdpop[tag + "年gdp"] - mean_gdp[stat_key]) / std_gdp[stat_key],
        "pops": (df_gdpop[tag + "年pops"] - mean_pop[stat_key]) / std_pop[stat_key],
        "temp": (df_gdpop["temperature"] - mean_temp) / std_temp,
        # Distance is standardised with this year's own mean and std.
        "dist": (dist - dist.mean()) / dist.std(),
        "point": df_gdpop[tag + "point"],
    }, columns=out_cols))

# Concatenate once: DataFrame.append no longer exists in pandas 2.x, and
# growing a frame inside the loop copies it on every iteration.
if yearly_frames:
    data_frame_all = pd.concat(yearly_frames, ignore_index=True)
else:
    data_frame_all = pd.DataFrame(index=[], columns=out_cols)

# Keep only rows with a non-zero score. The copy makes this an independent
# frame, so adding columns to it later does not warn about writing to a slice.
df_not0 = data_frame_all[data_frame_all.point != 0].copy()

## 回帰分析

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares of the medal score on the four standardised features.
col = ["temp", "gdp", "pops", "dist"]

clf = LinearRegression()
# .values replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
Y = df_not0["point"].values
X = df_not0[col].values

clf.fit(X, Y)

# assign() returns a new frame: no write into a filtered slice (and so no
# SettingWithCopyWarning), and re-running the cell gives the same result.
df_not0 = df_not0.assign(predict=clf.predict(X))
print(clf.intercept_)
# Coefficients, smallest first.
pd.DataFrame({"Name": col,
              "Coeff": clf.coef_}).sort_values(by="Coeff")

In [None]:
# Show every row for one country.
# Append [["country","point","predict"]] to the expression to narrow the view.
country = "Japan"
df_not0.loc[df_not0["country"] == country]


## GDP等のヒストグラムを見る

In [None]:
# GDP for 2010 on the log(1 + x) scale. The axis label names the plotted
# quantity, which is the transformed value rather than raw GDP.
fig_gdp, ax_gdp = plt.subplots(figsize=(10, 5))
np.log(df_gdp["2010年"] + 1).hist(bins=50, ax=ax_gdp)
ax_gdp.set_xlabel("log(GDP + 1)")
ax_gdp.set_ylabel("counts")
ax_gdp.set_title("GDP histogram")

# Temperature, rescaled as (Annual - 30) / 2.
# NOTE(review): presumably a rough Fahrenheit-to-Celsius conversion - confirm.
fig_temp, ax_temp = plt.subplots(figsize=(10, 5))
((df_temp["Annual"] - 30) / 2).hist(bins=50, ax=ax_temp)
ax_temp.set_xlabel("Temperature")
ax_temp.set_ylabel("counts")
ax_temp.set_title("temperature histogram");