In [None]:
#PROJET analyse de Données: US 2020 Elections

In [None]:
import pandas as pd
import numpy as np
import prince

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from matplotlib import colors as mcol
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
import plotly.figure_factory as ff

import matplotlib.pyplot as plt
import seaborn as sns
import csv

In [None]:
#PAGE DE REFERENCE POUR LES CARTES:    https://plotly.com/python/tile-county-choropleth/
#ANCIENNE PAGE:     https://plotly.com/python/county-choropleth/

In [None]:
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

In [None]:
import plotly.express as px

In [None]:
# Load the county-level dataset (sep=None lets the python engine detect the
# delimiter) and discard the first column of the file.
US_data = (
    pd.read_csv("./data.csv", sep=None, engine='python')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

US_data.head()

In [None]:
# Raw vote counts of both parties as NumPy arrays.
dem, rep = (
    np.array(US_data[vote_column])
    for vote_column in ("Democrat vote raw", "Republican vote raw")
)

# A county is labelled 'Democrat' only when the Democrat count is strictly
# larger; every other case (ties included) is labelled 'Republican'.
result = np.where(dem > rep, 'Democrat', 'Republican')
US_data['Parti'] = result

US_data.head()

In [None]:
# The FIPS code identifies US counties; it is used further down to display the
# clustering results on a map of the United States, so a matching column has
# to be added to the dataset.
# NOTE: the county map used here only covers the mainland United States
# (no Alaska, etc.), which has to be kept in mind for the visualisations.

# Read every field as text (dtype=str) so the codes are not parsed as numbers.
fips_data = pd.read_csv("./national_county.txt", header=None, dtype=str)

# Assign column names based on Census format
fips_data.columns = ["State Abbr", "State FIPS Code", "County FIPS Code", "County Name", "FIPS Class Code"]

# Create a full FIPS code column (State + County)
fips_data["FIPS"] = fips_data["State FIPS Code"] + fips_data["County FIPS Code"]

# Keep only the columns needed for the join. The explicit copy makes the
# assignments below write to an independent frame rather than to a slice of
# the original one (avoids pandas' SettingWithCopyWarning).
fips_data = fips_data[["State Abbr", "County Name", "FIPS"]].copy()

# Normalise the join keys: lowercase, trimmed, and county names stripped of
# every whitespace character.
fips_data["County Name"] = fips_data["County Name"].str.lower().str.strip().str.replace(r"\s+", "", regex=True)
fips_data["State Abbr"] = fips_data["State Abbr"].str.lower().str.strip()

In [None]:
# Normalise the join keys of the main dataset: lowercase, trimmed, whitespace removed.
US_data[["State", "County"]] = US_data[["State", "County"]].apply(
    lambda column: column.str.lower().str.strip().str.replace(r"\s+", "", regex=True)
)

# Load a mapping of state names to state abbreviations
state_abbr_url = "https://raw.githubusercontent.com/jasonong/List-of-US-States/master/states.csv"
state_abbr = pd.read_csv(state_abbr_url)
state_abbr.columns = ["State", "Abbreviation"]

# Apply the same kind of normalisation to the lookup table so both sides match.
state_abbr = state_abbr.assign(
    State=lambda table: table["State"].str.replace(r"\s+", "", regex=True).str.lower().str.strip(),
    Abbreviation=lambda table: table["Abbreviation"].str.lower().str.strip(),
)


In [None]:
# Sanity check before merging: list the distinct state keys of every table,
# one array per output line.
print(
    np.unique(US_data[['State']]),
    np.unique(fips_data[['State Abbr']]),
    np.unique(state_abbr[["Abbreviation"]]),
    np.unique(state_abbr[["State"]]),
    sep="\n",
)

In [None]:
# Merge state names with abbreviations
US_data = US_data.merge(state_abbr, left_on="State", right_on="State", how="left")

# Merge with FIPS data
US_data = US_data.merge(fips_data, left_on=["Abbreviation", "County"], right_on=["State Abbr", "County Name"], how="left")

# Drop extra columns
US_data = US_data.drop(columns=["State Abbr", "County Name", "Abbreviation"])

In [None]:
# Preview the first rows of the merged table.
US_data.head(n=5)

In [None]:
# Counties for which the join did not find any FIPS code.
fips_is_missing = US_data['FIPS'].isna()
missing_fips = US_data.loc[fips_is_missing]
print(missing_fips.loc[:, ['State', 'County']])

In [None]:
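# Manual fill of the FIPS codes that are still missing (left disabled).
# NOTE: the line below cannot work as written: integer literals with a leading
# zero are invalid in Python, and chained indexing assigns to a temporary copy.
# It would need string codes and US_data.loc[US_data['FIPS'].isna(), 'FIPS'] = [...].

#US_data[US_data['FIPS'].isna()]['FIPS'] = [02063,02066,02158,02195,46102]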

In [None]:
# Treat every "-" cell of the DataFrame as a missing value ...
US_data = US_data.replace("-", np.nan)

# ... then drop each row that contains at least one missing value.
US_data = US_data.dropna()

In [None]:
# Clean the percentage columns: strip the '%' sign, convert to numbers and
# divide by 100.
colonnes_pourcentages = [
    "Less than 9th grade",
    "9th to 12th grade",
    "HS graduate and eq",
    "College,No Degree",
    "Associates Degree",
    "Bachelors Degree",
    "Graduate or professional degree",
]

US_data[colonnes_pourcentages] = (
    US_data[colonnes_pourcentages]
    .replace('%', '', regex=True)
    .apply(pd.to_numeric)
    / 100
)

# Remove the commas from the two income columns and convert them to floats.
US_data[['Median income', 'Mean income']] = US_data[['Median income', 'Mean income']].apply(
    lambda income: income.astype(str).str.replace(",", "").astype(float)
)

# Show every column, then display the first rows as a check.
pd.set_option('display.max_columns', None)
US_data.head()

In [None]:
# Bar chart counting the counties per value of 'Parti'.
fig = sns.catplot(data=US_data, x='Parti', kind='count')

# Blank axis labels; the title carries the description.
fig.ax.set(
    xlabel='',
    ylabel='',
    title='Nombre de county majoritairement républicains ou démocrates respectivement',
)
plt.show()

# Emit an empty line after the figure.
print('')

In [None]:
# Box plots of the 'Less than 9th grade' column, one panel per winning party.
# Each series is drawn exactly once; drawing the same data a second time on
# the same axes only stacked identical artists on top of each other.
data_democrat = pd.to_numeric(US_data.loc[US_data['Parti'] == 'Democrat', 'Less than 9th grade'], errors='coerce').dropna()
data_republican = pd.to_numeric(US_data.loc[US_data['Parti'] == 'Republican', 'Less than 9th grade'], errors='coerce').dropna()

fig, axs = plt.subplots(1, 2, figsize=(12, 6))

fig.suptitle('Boxplot % Less than 9th grade', fontsize=16)

axs[0].boxplot(data_democrat)
axs[0].set_title('Democrat')

axs[1].boxplot(data_republican)
axs[1].set_title('Republican')

plt.subplots_adjust(wspace=0.4)

plt.show()

In [None]:
# Box plots of the 'HS graduate and eq' column, one panel per winning party.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fig.suptitle('Boxplot % HS graduate and eq', fontsize=16)
fig.subplots_adjust(wspace=0.4)

for panel, party in zip(axs, ('Democrat', 'Republican')):
    party_rows = US_data['Parti'] == party
    panel.boxplot(US_data.loc[party_rows, 'HS graduate and eq'])
    panel.set_title(party)

plt.show()

In [None]:
# Box plots of the 'College,No Degree' column, one panel per winning party.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fig.suptitle('Boxplot % College,No Degree', fontsize=16)
fig.subplots_adjust(wspace=0.4)

for panel, party in zip(axs, ('Democrat', 'Republican')):
    party_rows = US_data['Parti'] == party
    panel.boxplot(US_data.loc[party_rows, 'College,No Degree'])
    panel.set_title(party)

plt.show()

In [None]:
# Box plots of the 'Associates Degree' column, one panel per winning party.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fig.suptitle('Boxplot % Associates Degree', fontsize=16)
fig.subplots_adjust(wspace=0.4)

for panel, party in zip(axs, ('Democrat', 'Republican')):
    party_rows = US_data['Parti'] == party
    panel.boxplot(US_data.loc[party_rows, 'Associates Degree'])
    panel.set_title(party)

plt.show()

In [None]:
# Box plots of the 'Bachelors Degree' column, one panel per winning party.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fig.suptitle('Boxplot % Bachelors Degree', fontsize=16)
fig.subplots_adjust(wspace=0.4)

for panel, party in zip(axs, ('Democrat', 'Republican')):
    party_rows = US_data['Parti'] == party
    panel.boxplot(US_data.loc[party_rows, 'Bachelors Degree'])
    panel.set_title(party)

plt.show()

In [None]:
# Box plots of the 'Graduate or professional degree' column, one panel per
# winning party.
# The statements that used to follow plt.show() re-plotted 'HS graduate and eq'
# on the axes of the figure that had already been shown; they were a stale
# copy of an earlier cell and have been removed.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))

fig.suptitle('Boxplot % Graduate or professional degree', fontsize=16)

axs[0].boxplot(US_data.loc[US_data['Parti'] == 'Democrat', 'Graduate or professional degree'])
axs[0].set_title('Democrat')

axs[1].boxplot(US_data.loc[US_data['Parti'] == 'Republican', 'Graduate or professional degree'])
axs[1].set_title('Republican')

plt.subplots_adjust(wspace=0.4)

plt.show()

In [None]:
# Input table for the PCA: everything except the identifier columns, the
# party label and the FIPS code.
colonnes_exclues = ['County', 'State', 'Parti', 'FIPS']
US_dataACP = US_data.drop(columns=colonnes_exclues)

# Display all columns when previewing.
pd.set_option('display.max_columns', None)
US_dataACP.head()


In [None]:
# 1. Standardise the PCA input with a StandardScaler.
scaler = StandardScaler().fit(US_dataACP)
US_data_scaled = scaler.transform(US_dataACP)

# 2. Fit a PCA with all components kept and project the standardised data.
pca_US = PCA()
US_data_pca = pca_US.fit_transform(US_data_scaled)

In [None]:
# Explained variance of the PCA, in percent: per component (left) and
# cumulative (right).
explained_var_ratio = 100*pca_US.explained_variance_ratio_
n_components = len(explained_var_ratio)

plt.subplot(1,2,1)
# Show at most 20 bars, but never more than the number of components that
# exist; otherwise the x positions and the heights would differ in length.
n_bars = min(20, n_components)
plt.bar(np.arange(1,n_bars+1), explained_var_ratio[:n_bars], color='red')
plt.xlabel("Number of components")
plt.ylabel("Percentage of explained variance")

plt.subplot(1,2,2)
# Explicit x values starting at 1 so that the abscissa really is the number of
# components included in the cumulative sum (the default would start at 0).
plt.plot(np.arange(1, n_components+1), np.cumsum(explained_var_ratio),color='yellow')
plt.xlabel("Number of components")
plt.ylabel("Cumulative explained variance");

plt.tight_layout()
plt.show()

In [None]:
# Box plots of the scores on the first ten principal components.
# The scores are the columns of US_data_pca (output of the PCA);
# US_data_scaled only holds the standardised original variables, which is not
# what the title announces.
box = plt.boxplot(US_data_pca[:,:10],patch_artist=True)
plt.setp(box["boxes"],facecolor="yellow",alpha=.5)
plt.title("Box plots of the first ten principal components")
plt.tight_layout()
plt.show()

In [None]:
# Coordinates of the variables on the first three principal axes: each
# component vector scaled by the square root of its explained variance.
coord1 = pca_US.components_[0] * np.sqrt(pca_US.explained_variance_[0])
coord2 = pca_US.components_[1] * np.sqrt(pca_US.explained_variance_[1])
coord3 = pca_US.components_[2] * np.sqrt(pca_US.explained_variance_[2])

fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(1,1,1)
# The PCA was fitted on US_dataACP, so the labels must come from its columns;
# US_data also contains the columns dropped before the PCA, so its column
# list does not line up with the component entries.
for i, j, nom in zip(coord1, coord2, US_dataACP.columns):
    ax.text(i,j,nom,fontsize=10)
    ax.arrow(0,0,i,j,color = 'yellow', alpha=0.7,width = 0.0001)

ax.axis((-1,1,-1,1))
# Unit circle used as visual reference.
ax.add_artist(plt.Circle((0,0), radius = 1, color = 'red', fill = False))

ax.set_title('Variables factor map - PCA')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')

ax.grid(True)
plt.show()

In [None]:
# Variables factor map on principal components 1 and 3 (coord1 and coord3
# are computed in the previous cell).
fig = plt.figure(figsize=(8,8))
ax = fig.add_subplot(1,1,1)
# The PCA was fitted on US_dataACP, so the labels must come from its columns;
# US_data also contains the columns dropped before the PCA, so its column
# list does not line up with the component entries.
for i, j, nom in zip(coord1, coord3, US_dataACP.columns):
    ax.text(i,j,nom,fontsize=10)
    ax.arrow(0,0,i,j,color = 'yellow', alpha=0.7,width = 0.0001)

ax.axis((-1,1,-1,1))
# Unit circle used as visual reference.
ax.add_artist(plt.Circle((0,0), radius = 1, color = 'red', fill = False))

ax.set_title('Variables factor map - PCA')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 3')

ax.grid(True)
plt.show()

In [None]:
# Keep only the scores on the first ten principal components.
US_data_reduced = US_data_pca[:, 0:10]

In [None]:
# Choose the number of classes for K-means with yellowbrick's elbow visualizer.
# The search runs on US_data_reduced (the PCA-reduced, standardised data that
# the final K-means is fitted on) instead of the raw, unscaled US_dataACP
# table, so that the suggested K applies to the data actually clustered.
kmeans = KMeans(init='k-means++', n_init='auto', max_iter=100, random_state=42)
visualizer = KElbowVisualizer(kmeans, k=(4,12))

visualizer.fit(US_data_reduced)
visualizer.show()

In [None]:
# Number of clusters.
K=9
# random_state is fixed (same seed as the elbow estimator) so that the cluster
# labels, and therefore every figure built from them, are reproducible from
# one run of the notebook to the next.
kmeans_pca = KMeans(n_clusters = K, init='k-means++', n_init='auto', random_state=42)
clusters_pca = kmeans_pca.fit_predict(US_data_reduced)

# Number of counties per cluster, one colour per cluster.
cmap = plt.get_cmap('Set1',K)
plt.bar(*np.unique(clusters_pca,return_counts=True),color=cmap.colors)
plt.ylabel("Frequency")
plt.show()

In [None]:
# Check sizes before attaching the labels: table shape, then the number of
# rows of the FIPS table next to the number of cluster labels.
print(US_data.shape)
print(len(fips_data), len(clusters_pca))

# Store the cluster label of every county as a new column.
US_data['clusters'] = clusters_pca

In [None]:
# One hex colour per cluster, taken from the colormap of the bar chart.
colorscale = [mcol.to_hex(cmap(i)) for i in range(K)]
fips = US_data ['FIPS'].tolist()
values = clusters_pca.tolist()

# Labels are stored as strings so that plotly treats them as categories.
US_data['clusters'] = list(map(str, values))

# category_orders pins the category order to "0", "1", ..., so that cluster i
# receives colorscale[i] (the colour it has in the bar chart) instead of a
# colour that depends on the order in which the labels appear in the table.
fig = px.choropleth_map(US_data, geojson=counties, locations='FIPS', color='clusters',
                        color_discrete_sequence=colorscale,
                        category_orders={'clusters': [str(i) for i in range(K)]},
                        map_style="carto-positron",
                        zoom=3, 
                        center = {"lat": 37.0902, "lon": -95.7129},
                        opacity=1,
                        title='répartition des clusters géographiquement aux USA')
fig.update_traces(marker_line_width=0)
# Keep some room at the top: with a zero top margin the title has no space.
fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.show()
# The helper column is only needed for this figure.
US_data = US_data.drop(columns=["clusters"])

In [None]:
# Map of the natural logarithm of the 'Density' column.
logD = np.log(np.array(US_data["Density"]))

US_data['logDensity'] = logD

# Bounds of the colour range.
m = np.min(US_data['logDensity'])
M = np.max(US_data['logDensity'])

# White-to-dark-red scale given as (position, colour) stops.
colorscale = [
    [0.0, "rgb(255,255,255)"],
    [0.1, "rgb(255,230,230)"],
    [0.3, "rgb(255,153,153)"],
    [0.5, "rgb(255,102,102)"],
    [0.7, "rgb(255,51,51)"],
    [0.9, "rgb(204,0,0)"],
    [1.0, "rgb(153,0,0)"]
]


# The label mapping is keyed on the plotted column ('logDensity'); a key that
# names another column is never applied.
fig = px.choropleth_map(US_data, geojson=counties, locations='FIPS', color='logDensity',
                           color_continuous_scale=colorscale,
                           range_color=(m, M),
                           map_style="carto-positron",
                           zoom=3, center = {"lat": 37.0902, "lon": -95.7129},
                           opacity=1,
                           labels={'logDensity':'Counties log Density'}
                          )
fig.update_traces(marker_line_width=0)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
# The helper column is only needed for this figure.
US_data = US_data.drop(columns=["logDensity"])

In [None]:
# Map of the winning party per county.
colorscale = [mcol.to_hex('red'),mcol.to_hex('blue')]

# - color_discrete_map ties each party to its colour explicitly, instead of
#   relying on which party happens to appear first in the table.
# - The description is passed as the figure title: `labels` expects a dict
#   mapping column names to display names, not a set holding one sentence.
fig = px.choropleth_map(US_data, geojson=counties, locations='FIPS', color='Parti',
                           color_discrete_map={'Republican': colorscale[0], 'Democrat': colorscale[1]},
                           map_style="carto-positron",
                           zoom=3, center = {"lat": 37.0902, "lon": -95.7129},
                           opacity=0.8,
                           title='répartition géographique du parti majoritaire aux élections'
                          )
fig.update_traces(marker_line_width=0)
# Keep some room at the top so that the title is visible.
fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.show()

In [None]:
#colors: https://plotly.com/python/builtin-colorscales/
# The matplotlib cmap colour ranges look much better, but they have not been converted for plotly yet; this can be done if needed.

In [None]:
# Map of the 'Republican vote %' column on a diverging colour scale.
# The description is passed as the figure title: `labels` expects a dict
# mapping column names to display names, not a set holding one sentence.
fig = px.choropleth_map(US_data, geojson=counties, locations='FIPS', color='Republican vote %',
                           color_continuous_scale="balance",
                           range_color=(0, 100),
                           map_style="carto-positron",
                           zoom=3, center = {"lat": 37.0902, "lon": -95.7129},
                           opacity=1,
                           title='répartition géographique du pourcentage de vote aux élections'
                          )
fig.update_traces(marker_line_width=0)
# Keep some room at the top so that the title is visible.
fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.show()