## Import data

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.manifold import TSNE
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, LabelSet
import pycountry_convert as pc
output_notebook()

In [2]:
dims = pd.read_csv("6-dimensions_new.csv")
print(dims.head())
print("Nb lignes : ",len(dims.index))
dims = dims.dropna()
dims = dims.drop([0,1,5])
#supprimer regions
print("Nb lignes filtrées : ",len(dims.index))
print("Index : ",dims.columns)
print(dims.head(10))

   Unnamed: 0  ctr      country   pdi   idv   mas   uai     ltowvs        ivr
0           0  AFE  Africa East  64.0  27.0  41.0  52.0  32.000000  40.000000
1           1  AFW  Africa West  77.0  20.0  46.0  54.0   9.000000  78.000000
2           2  ALB      Albania   NaN   NaN   NaN   NaN  61.460957  14.508929
3           3  ALG      Algeria   NaN   NaN   NaN   NaN  25.944584  32.366071
4           4  AND      Andorra   NaN   NaN   NaN   NaN        NaN  65.000000
Nb lignes :  111
Nb lignes filtrées :  62
Index :  Index(['Unnamed: 0', 'ctr', 'country', 'pdi', 'idv', 'mas', 'uai', 'ltowvs',
       'ivr'],
      dtype='object')
    Unnamed: 0  ctr     country   pdi   idv   mas   uai     ltowvs        ivr
6            6  ARG   Argentina  49.0  46.0  56.0  86.0  20.403023  61.830357
8            8  AUS   Australia  38.0  90.0  61.0  51.0  21.158690  71.428571
9            9  AUT     Austria  11.0  55.0  79.0  70.0  60.453401  62.723214
11          11  BGD  Bangladesh  80.0  20.0  55.0  60.0

In [3]:
model = SpectralClustering(n_clusters=4,assign_labels="discretize",affinity='nearest_neighbors',random_state=0)
X = dims[['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']]
y = dims["country"]
clust_labels = model.fit_predict(X)
#clust_labels = model.predict(X)
#cent = model.cluster_centers_
#plt.scatter(mat[:,0],mat[:,1],c=clust_labels)
#for i, txt in enumerate(y):
#    plt.annotate(txt, (mat[i,0], mat[i,1]))

In [4]:
pca = TSNE(n_components=2,random_state=0,perplexity=20)
mat = pca.fit_transform(X,y)
p = figure(title = "Clustering TSNE")
#p.xaxis.axis_label = 'Petal Length'
#p.yaxis.axis_label = 'Petal Width'
colors = ["red","blue","green","yellow","black","brown"]
countries = [pc.country_alpha2_to_continent_code(pc.country_name_to_country_alpha2(x)) for x in dims["country"]]
#countries = [np.unique(countries).index(x) for x in countries]
uniq = list(np.unique(countries))
conts = [uniq.index(x) for x in countries]
#for i,con in zip(list(range(len(np.unique(countries)))),np.unique(countries)):
#    print(np.where(countries==con))
#    conts[np.where(countries==con)] = i
    
source = ColumnDataSource(data={"x":mat[:,0], "y":mat[:,1], "label":y, 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

#output_file("iris.html", title="iris.py example")

show(p)

In [5]:
pca = PCA(n_components=2,random_state=0)
mat = pca.fit_transform(X,y)
p = figure(title = "Clustering PCA : "+str(int(sum(pca.explained_variance_ratio_)*100))+"%")
#p.xaxis.axis_label = 'Petal Length'
#p.yaxis.axis_label = 'Petal Width'
colors = ["red","blue","green","yellow","black","brown"]
countries = [pc.country_alpha2_to_continent_code(pc.country_name_to_country_alpha2(x)) for x in dims["country"]]
#countries = [np.unique(countries).index(x) for x in countries]
uniq = list(np.unique(countries))
conts = [uniq.index(x) for x in countries]
#for i,con in zip(list(range(len(np.unique(countries)))),np.unique(countries)):
#    print(np.where(countries==con))
#    conts[np.where(countries==con)] = i
    
source = ColumnDataSource(data={"x":mat[:,0], "y":mat[:,1], "label":y, 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

#output_file("iris.html", title="iris.py example")

show(p)

In [114]:
from math import log
gdp = pd.read_csv("gdp.csv")
gdp = gdp[gdp["Country Name"].isin(y)]
for i in y:
    happened = False
    for j in gdp["Country Name"]:
        if i == j:
            happened = True
    if not happened:
        print(i)
gdp_x = gdp["2010"].to_numpy()
gdp_x = np.array(list(map(lambda x: log(x), gdp_x)))
gdp_y = gdp["1970"].to_numpy()
gdp_y = np.array(list(map(lambda x: log(x), gdp_x)))

pca = TSNE(n_components=1,random_state=0,perplexity=20)
mat = pca.fit_transform(X,y)
p = figure(title = "Regression ?")
p.xaxis.axis_label = 'log GDP'
p.yaxis.axis_label = 'T-SNE'
colors = ["red","blue","green","yellow","black","brown"]
countries = [pc.country_alpha2_to_continent_code(pc.country_name_to_country_alpha2(x)) for x in dims["country"]]
#countries = [np.unique(countries).index(x) for x in countries]
uniq = list(np.unique(countries))
conts = [uniq.index(x) for x in countries]
#for i,con in zip(list(range(len(np.unique(countries)))),np.unique(countries)):
#    print(np.where(countries==con))
#    conts[np.where(countries==con)] = i
    
source = ColumnDataSource(data={"x":gdp_x, "y":mat[:], "label":y, 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

#output_file("iris.html", title="iris.py example")

show(p)

Great Britain
Hong Kong
Iran
South Korea
Russia
Taiwan
Venezuela



ColumnDataSource's columns must be of the same length. Current lengths: ('color', 62), ('label', 62), ('x', 55), ('y', 62)



In [7]:
di = pd.read_excel("6-dimensions-for-website-2015-08-16.xls")
for i,row in di.iterrows():
    for j in dims["country"]:
        if row["country"] == j:
            di.at[i,"ctr"] = pc.country_name_to_country_alpha3(row["country"])
di.to_csv("6-dimensions_new.csv")

In [54]:
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py

py.init_notebook_mode(connected=True)

#df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2014_world_gdp_with_codes.csv')
df = pd.read_csv('gdp.csv')


data = [go.Choropleth(
    #locations = df['CODE'],
    #z = df['GDP (BILLIONS)'],
    #text = df['COUNTRY'],
    locations = df['Country Code'],
    z = df['2016'],
    text = df['Country Name'],
    colorscale= [
        [0,"red"],
        [0.8,"orange"],
        [0.95,"yellow"],
        [1,"white"],
    ],
    autocolorscale = False,
    reversescale = True,
    marker = go.choropleth.Marker(
        line = go.choropleth.marker.Line(
            color = 'black',
            width = 1
        )),
    colorbar = go.choropleth.ColorBar(
        tickprefix = '$',
        title = 'GDP<br>Billions US$'),
)]

layout = go.Layout(
    title = go.layout.Title(
        text = '2016 Global GDP'
    ),
    geo = go.layout.Geo(
        showframe = False,
        showcoastlines = False,
        projection = go.layout.geo.Projection(
            type = 'equirectangular'
        )
    ),
    annotations = [go.layout.Annotation(
        x = 0.55,
        y = 0.1,
        xref = 'paper',
        yref = 'paper',
        text = 'Source: World Bank 2016',
        showarrow = False
    )]
)

fig = go.Figure(data = data, layout = layout)
py.iplot(fig, filename = 'd3-world-map')

In [106]:
df = pd.read_csv('gdp_region.csv')

p = figure(title = "Croissance régionale du PIB et moyenne mondiale",
           x_axis_label="1975 GDP",y_axis_label="2016 GDP",
                  x_axis_type="log",y_axis_type="log")

source = ColumnDataSource(data=df)

p.circle(x="1975",y="2016", source=source, fill_alpha=0.2, size=10)
mi,ma=df.iloc[df['1975'].idxmin()],df.iloc[df['2016'].idxmax()]
p.line([ma["1975"]//100,ma["1975"]],[ma["2016"]//100,ma["2016"]],line_width=2)

labels = LabelSet(x="1975",y="2016",source=source, text="Country Name", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

show(p)

    PIB par capita

In [123]:
gdp = pd.read_csv("gdp_per_cap.csv")
gdp = gdp[gdp["Country Name"].isin(y)]
gdp_x = gdp["2010"].to_numpy()
gdp_x = np.array(list(map(lambda x: log(x), gdp_x)))
gdp_y = gdp["1970"].to_numpy()
gdp_y = np.array(list(map(lambda x: log(x), gdp_x)))

In [124]:
p = figure(title = "PIB en fonction de l'index de distance au pouvoir")
p.xaxis.axis_label = 'IDP'
p.yaxis.axis_label = 'log PIB'

#X = dims[['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']]
   
source = ColumnDataSource(data={"x":dims["pdi"], "y":gdp_x, "label":dims["country"], 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

show(p)


ColumnDataSource's columns must be of the same length. Current lengths: ('color', 62), ('label', 62), ('x', 62), ('y', 55)



In [125]:
p = figure(title = "PIB en fonction de l'index d'individualisme'")
p.xaxis.axis_label = 'IDP'
p.yaxis.axis_label = 'log PIB'

#X = dims[['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']]
   
source = ColumnDataSource(data={"x":dims["idv"], "y":gdp_x, "label":dims["country"], 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

show(p)


ColumnDataSource's columns must be of the same length. Current lengths: ('color', 62), ('label', 62), ('x', 62), ('y', 55)



In [126]:
p = figure(title = "PIB en fonction de l'index de masculinité/féminité")
p.xaxis.axis_label = 'IDP'
p.yaxis.axis_label = 'log PIB'

#X = dims[['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']]
   
source = ColumnDataSource(data={"x":dims["mas"], "y":gdp_x, "label":dims["country"], 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

show(p)


ColumnDataSource's columns must be of the same length. Current lengths: ('color', 62), ('label', 62), ('x', 62), ('y', 55)



In [127]:
p = figure(title = "PIB en fonction de l'index d'esquive de l'incertitude")
p.xaxis.axis_label = 'IDP'
p.yaxis.axis_label = 'log PIB'

#X = dims[['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']]
   
source = ColumnDataSource(data={"x":dims["uai"], "y":gdp_x, "label":dims["country"], 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

show(p)


ColumnDataSource's columns must be of the same length. Current lengths: ('color', 62), ('label', 62), ('x', 62), ('y', 55)



In [128]:
p = figure(title = "PIB en fonction de l'index de vision au long terme")
p.xaxis.axis_label = 'IDP'
p.yaxis.axis_label = 'log PIB'

#X = dims[['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']]
   
source = ColumnDataSource(data={"x":dims["ltowvs"], "y":gdp_x, "label":dims["country"], 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

show(p)


ColumnDataSource's columns must be of the same length. Current lengths: ('color', 62), ('label', 62), ('x', 62), ('y', 55)



In [129]:
p = figure(title = "PIB en fonction de l'index d'indulgence'")
p.xaxis.axis_label = 'IDP'
p.yaxis.axis_label = 'log PIB'

#X = dims[['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']]
   
source = ColumnDataSource(data={"x":dims["ivr"], "y":gdp_x, "label":dims["country"], 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

show(p)


ColumnDataSource's columns must be of the same length. Current lengths: ('color', 62), ('label', 62), ('x', 62), ('y', 55)



In [130]:
p = figure(title = "PIB en fonction de l'index de distance au pouvoir")
p.xaxis.axis_label = 'IDP'
p.yaxis.axis_label = 'log PIB'

#X = dims[['pdi', 'idv', 'mas', 'uai', 'ltowvs', 'ivr']]
   
source = ColumnDataSource(data={"x":dims["pdi"], "y":gdp_y, "label":dims["country"], 
                                "color":list([colors[int(i)] for i in conts])})

p.circle(x="x",y="y",color="color", source=source, fill_alpha=0.2, size=10)

labels = LabelSet(x="x",y="y",source=source, text="label", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  text_align='center')
p.add_layout(labels)

show(p)


ColumnDataSource's columns must be of the same length. Current lengths: ('color', 62), ('label', 62), ('x', 62), ('y', 55)

