Chargement des diverses librairies dont j'aurai besoin pour la réalisation de l'EDA 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

Importation de certaines librairies qui permettent de faire des graphiques plus interactifs

In [2]:
import plotly.tools as tls
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import warnings
from collections import Counter

# 1. Découverte des données

Connaissance des informations principales des données brutes

Chargement du dataset

In [None]:
# Read the raw dataset from its CSV file and preview the first rows.
DATASET_CSV = "../downloads/dataset.csv"
df = pd.read_csv(DATASET_CSV)
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Print the per-column summary of the dataset (non-null counts and dtypes).
df.info()

In [None]:
# Report the dataset dimensions: rows are samples, columns are variables.
n_samples, n_variables = df.shape
print(f'Le dataset comporte {n_samples} samples et {n_variables} variables')

In [None]:
# List the column names. The count shown in the message is computed from the
# data instead of being hard-coded, so it stays correct if the dataset changes.
df_col = df.columns.tolist()
print(f'Les {len(df_col)} variables sont les suivantes :')
df_col

In [None]:
# Show the dtype of every column.
df.dtypes

In [None]:
# Split the columns by dtype. The lists are derived from the DataFrame itself
# (instead of being typed by hand) so they cannot drift from the actual data.
num_var = df.select_dtypes(include='number').columns.tolist()
obj_var = df.select_dtypes(exclude='number').columns.tolist()

print(f'Les variables de type numérique sont les suivantes :\n\n{num_var}\n\n')
print(f'Les variables de type objet/catégorie sont les suivantes:\n\n{obj_var}')

In [None]:
# Summary statistics from describe(), transposed so that each variable is a row.
df.describe().T

# 2. Données manquantes et/ou dupliquées

In [None]:
# Count the rows that are exact duplicates of an earlier row.
duplicate_flags = df.duplicated()
duplicated_values = duplicate_flags.sum()
print('Le dataset comporte {} valeurs dupliquées.'.format(duplicated_values))

In [None]:
# Percentage of missing values per column, sorted from most to least affected.
missing_counts = df.isnull().sum(axis=0)
null_c = missing_counts.sort_values(ascending=False) * 100 / df.shape[0]

# Only the columns whose missing ratio is non-zero are listed in the message.
features_with_nan = list(null_c[null_c != 0].index)
print(f"les features ont des données manquantes {features_with_nan}")

# Horizontal bar chart: one bar per column, bar length = missing ratio in %.
a, b = null_c.values, null_c.index
sns.barplot(x=a, y=b)
plt.xlabel("ratio %")
plt.ylabel("feature in data")
plt.title("ratio of missing data")

In [None]:
# Columns that contain at least one missing value. The printed set is built
# from the computed columns rather than hard-coded, so every affected column
# is reported.
col_nan = df.columns[df.isnull().any()]
X_text = set(col_nan)
print(f'Les variables contenant des données manquantes sont les suivantes :\n\n{X_text}')

In [None]:
# Bar chart of the number of non-null values in each column.
df.count().plot.bar(title='Somme des données par variables', figsize=(15, 5))

Pour gérer les valeurs manquantes, bien qu'elles soient quasiment insignifiantes, nous allons les remplacer par le mode (la valeur la plus fréquente) de la colonne.

In [None]:
# Replace the missing 'usd pledged' values with the column's mode (most
# frequent value). `.mode()` returns a Series (there can be ties); `.iloc[0]`
# takes its first entry. This avoids the deprecated `Series.ravel()`.
usd_pledged_mode = df['usd pledged'].mode().iloc[0]
df['usd pledged'] = df['usd pledged'].fillna(usd_pledged_mode)

## La colonne 'state' semble être la colonne clé de ce dataset


Nous allons nous pencher sur son analyse, car c'est elle qui nous donne l'état de réalisation d'un projet.

In [None]:

# Share of each project state, in percent, rounded to two decimals.
state_counts = df["state"].value_counts()
percentual_sucess = (state_counts / len(df["state"]) * 100).round(2)

print("State Percentual in %: ")
print(percentual_sucess)

# The pie chart displays the very same percentages.
state = percentual_sucess
labels, values = list(state.index), list(state.values)

trace1 = go.Pie(labels=labels, values=values, marker={'colors': ['red']})
layout = go.Layout(title='Distribution of States', legend={'orientation': "h"})

fig = go.Figure(data=[trace1], layout=layout)
iplot(fig)



Constat: 

Cette distribution nous permet d'avoir une vue globale des différents projets chez Kickstarter.

On peut aisément constater que 35,4% des projets aboutissent à un 'successful'.
Environ un tiers des projets lancés chez Kickstarter aboutissent.

Par contre, 52,2% des projets aboutissent à un 'failed'.
Plus de la moitié des projets sont des échecs.

Environ 10,2% des projets s'arrêtent en cours de route ('canceled').



## On va s'intéresser maintenant à la valeur qu'on souhaite prédire ...

Pour ce faire, nous allons explorer les distributions du logarithme de ces valeurs.

In [None]:
# Split the projects by final state to compare their goal distributions.
df_failed = df[df["state"] == "failed"]
df_sucess = df[df["state"] == "successful"]

# log(1 + x) keeps zero amounts finite and compresses the heavy right tail.
# Bottom-left panel: goal (USD) of all projects.
trace0 = go.Histogram(
    x=np.log1p(df["usd_goal_real"]),
    histnorm='probability', showlegend=False,
    xbins=dict(start=-5.0, end=19.0, size=1))

# Bottom-right panel: amount pledged (USD) of all projects.
trace1 = go.Histogram(
    x=np.log1p(df["usd_pledged_real"]),
    histnorm='probability', showlegend=False,
    xbins=dict(start=-1.0, end=17.0, size=1))

# Top panel: goal of failed vs successful projects, overlaid.
trace3 = go.Histogram(
    x=np.log1p(df_failed["usd_goal_real"]),
    opacity=0.60, nbinsx=30, name='Goals Failed', histnorm='probability')
trace4 = go.Histogram(
    x=np.log1p(df_sucess["usd_goal_real"]),
    opacity=0.60, nbinsx=30, name='Goals Sucessful', histnorm='probability')

# Grid: one wide panel on the first row, two panels on the second row.
fig = tls.make_subplots(rows=2, cols=2, specs=[[{'colspan': 2}, None], [{}, {}]],
                        subplot_titles=('Failed and Sucessful Projects',
                                        'Goal', 'Pledged'))

# Place each trace in its panel (row, col).
fig.append_trace(trace0, 2, 1)
fig.append_trace(trace1, 2, 2)
fig.append_trace(trace3, 1, 1)
fig.append_trace(trace4, 1, 1)

# 'overlay' draws the failed/successful histograms on top of each other.
fig['layout'].update(title="Distribution of projects",
                     height=500, width=900, barmode='overlay')
iplot(fig)

## Analyse des variables continues 

In [None]:
# Descriptive statistics of the campaign goal and of the amount pledged.
goal_pledged = df[["goal", "pledged"]]

print("Min Goal and Pledged values")
print(goal_pledged.min())
print("")
print("Mean Goal and Pledged values")
print(round(goal_pledged.mean(), 2))
print("")
print("Median Goal and Pledged values")
print(goal_pledged.median())
print("")
print("Max Goal and Pledged values")
# The maxima are computed from the data rather than typed in by hand.
# A fixed-point float format stops pandas from switching to scientific
# notation for these large values.
with pd.option_context("display.float_format", "{:.2f}".format):
    print(goal_pledged.max())
print("")
print("Std Goal and Pledged values")
print(round(goal_pledged.std(), 2))



# 3. Outliers

In [None]:
# Box plots of the DataFrame's columns drawn on a single 15x5 figure.
df.boxplot(figsize=(15,5))


# 4. Analyse Univariée

In [None]:
# Number of projects per main category, as a bar chart.
df["main_category"].value_counts().plot.bar(title='MAIN CATEGORY')

In [None]:
# Number of projects per final state, as a bar chart.
df["state"].value_counts().plot.bar(title='STATE')

## Analysons plus en détail les catégories

- Sucessful category's frequency
- failed category's frequency
- General Goal Distribuition by Category


In [None]:
# Project counts per main category: overall, failed only, successful only.
is_failed = df["state"] == "failed"
is_successful = df["state"] == "successful"

main_cats = df["main_category"].value_counts()
main_cats_failed = df.loc[is_failed, "main_category"].value_counts()
main_cats_sucess = df.loc[is_successful, "main_category"].value_counts()

In [None]:
# premier plot
# One bar chart per panel: (category counts, trace name, (row, col) position).
bar_panels = [
    (main_cats_failed, "Failed Category's", (1, 1)),
    (main_cats_sucess, "Sucess Category's", (1, 2)),
    (main_cats, "All Category's Distribution", (2, 1)),
]

# Grid: two panels on the first row, one wide panel on the second row.
fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                        subplot_titles=('Failed', 'Sucessful', "General Category's"))

# Build each bar trace and place it in its panel.
for counts, trace_name, (row, col) in bar_panels:
    bar = go.Bar(x=counts.index, y=counts.values, name=trace_name)
    fig.append_trace(bar, row, col)

fig['layout'].update(showlegend=True, title="Main Category's Distribution", bargap=0.05)
iplot(fig)


## Analyse de 'Goal' et de 'Pledged' by State

In [None]:
# Mean of `goal` and `usd_pledged_real` for each project state.
# The columns are selected with a list: tuple-style selection on a GroupBy
# (`["goal", "usd_pledged_real"]` without the inner brackets) is rejected by
# pandas >= 1.0.
print("Analyse de 'Goal' et 'Pledged' moyenne par 'state' ")
print(df.groupby("state")[["goal", "usd_pledged_real"]].mean().round(2))

In [None]:
# Number of most frequent categories kept per panel, and cap on the number of
# rows sent to each histogram.
TOP_N = 25
MAX_ROWS = 100000

failed_mask = df["state"] == "failed"
successful_mask = df["state"] == "successful"

# Most frequent categories among failed, successful and all projects.
categorys_failed = df[failed_mask]["category"].value_counts()[:TOP_N]
categorys_sucessful = df[successful_mask]["category"].value_counts()[:TOP_N]
categorys_general = df["category"].value_counts()[:TOP_N]

# NOTE: head(MAX_ROWS) keeps the first rows in file order, so the percentages
# describe that subset only.
# The labels are built from TOP_N so they always match the number of
# categories actually selected (they previously said "Top 15" for 25 categories).
# First panel: failed projects within the top failed categories.
trace0 = go.Histogram(
    x=df[df.category.isin(categorys_failed.index.values)
         & failed_mask]['category'].head(MAX_ROWS),
    histnorm='percent', name=f"Top {TOP_N} Failed", showlegend=False
)
# Second panel: successful projects within the top successful categories.
trace1 = go.Histogram(
    x=df[df.category.isin(categorys_sucessful.index.values)
         & successful_mask]['category'].head(MAX_ROWS),
    histnorm='percent', name=f"Top {TOP_N} Sucessful", showlegend=False
)
# Third panel: all projects within the overall top categories.
trace2 = go.Histogram(
    x=df[df.category.isin(categorys_general.index.values)]['category'].head(MAX_ROWS),
    histnorm='percent', name=f"Top {TOP_N} All Category's", showlegend=False
)

# Grid: two panels on the first row, one wide panel on the second row.
fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                        subplot_titles=(f'Top {TOP_N} Failed', f'Top {TOP_N} Sucessful',
                                        f"Top {TOP_N} All Category's"))

# Place each trace in its panel (row, col).
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 2, 1)

fig['layout'].update(showlegend=True, title="Top Frequency Category's")
iplot(fig)

In [None]:
# premier plot
# Derived columns read by this cell and by later cells. No visible cell of the
# notebook creates them, so a fresh "Restart & Run All" fails without this.
# They are only created when missing, so an existing definition is kept.
# NOTE(review): the formulas are inferred from how the columns are used and
# labelled further down ("Pledged(log)", "Goal(log)", "% of goal reached") --
# confirm they match the intended definitions.
derived_columns = {
    'goal_log': lambda d: np.log(d['usd_goal_real'] + 1),
    'pledged_log': lambda d: np.log(d['usd_pledged_real'] + 1),
    # Percentage of the goal that was pledged.
    'diff_pledged_goal': lambda d: (d['usd_pledged_real'] / d['usd_goal_real'] * 100).round(2),
}
for column, build in derived_columns.items():
    if column not in df.columns:
        df[column] = build(df)

# Cap on the rows sent to each box plot. The cap is applied to the filtered
# frame as a whole so that x and y always have the same length.
MAX_ROWS = 100000

failed_top = df[df.category.isin(categorys_failed.index.values)
                & (df["state"] == "failed")].head(MAX_ROWS)
successful_top = df[df.category.isin(categorys_sucessful.index.values)
                    & (df["state"] == "successful")].head(MAX_ROWS)
general_top = df[df.category.isin(categorys_general.index.values)].head(MAX_ROWS)

# First panel: failed projects.
trace0 = go.Box(
    x=failed_top['category'],
    y=failed_top['pledged_log'],
    name="Failed Category's", showlegend=False
)

# Second panel: successful projects.
trace1 = go.Box(
    x=successful_top['category'],
    y=successful_top['pledged_log'],
    name="Sucessful Category's", showlegend=False
)

# Third panel: all projects.
trace2 = go.Box(
    x=general_top['category'],
    y=general_top['pledged_log'],
    name="All Category's Distribuition", showlegend=False
)

# Grid: two panels on the first row, one wide panel on the second row.
fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                        subplot_titles=('Failed', 'Sucessful', "General Category's", ))

# Place each trace in its panel (row, col).
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 2, 1)

fig['layout'].update(showlegend=True, title="Main Category's Distribution")
iplot(fig)

In [None]:
# Projects (all states) belonging to the 10 categories with the most failures.
cat_fill = df[df.category.isin(categorys_failed[:10].index.values)]

# Cap the rows once on the whole frame so that x and y always have the same
# length in every trace.
cat_fill_sample = cat_fill.head(100000)

# Trace names match the quantity plotted in each panel (they previously
# carried unrelated "Failed"/"Sucessful" labels left over from another cell).
# First panel: log of the goal.
trace0 = go.Box(
    x=cat_fill_sample['category'],
    y=cat_fill_sample['goal_log'],
    name="Goal Log", showlegend=False
)

# Second panel: log of the amount pledged.
trace1 = go.Box(
    x=cat_fill_sample['category'],
    y=cat_fill_sample['pledged_log'],
    name="Pledged Log", showlegend=False
)

# Third panel: log(1 + diff_pledged_goal).
trace2 = go.Box(
    x=cat_fill_sample['category'],
    y=np.log(cat_fill_sample['diff_pledged_goal'] + 1),
    name="Diff of Pledged and Goal", showlegend=False
)

# Grid: two panels on the first row, one wide panel on the second row.
fig = tls.make_subplots(rows=2, cols=2, specs=[[{}, {}], [{'colspan': 2}, None]],
                        subplot_titles=('Goal Log', 'Pledged Log', "Diff of Pledged and Goal", ))

# Place each trace in its panel (row, col).
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)
fig.append_trace(trace2, 2, 1)

fig['layout'].update(showlegend=True,
                     title="Distribution of Values by Top 10 Categorys")
iplot(fig)

## Nous nous focalisons sur les top 'succes' et les top 'fail' projects

In [None]:
# Successful projects of three main categories.
successful = df['state'] == 'successful'
sucess_music = df[(df['main_category'] == 'Music') & successful]
sucess_filme_video = df[(df['main_category'] == 'Film & Video') & successful]
sucess_games = df[(df['main_category'] == 'Games') & successful]

# One stacked panel per main category: (data, title, x-axis label).
success_panels = (
    (sucess_music, "Categorys of Music with Sucess", "Music categories"),
    (sucess_filme_video, "Categorys of Film & Video with Sucess", "Film and Video Categorys"),
    (sucess_games, "Category Games with Sucess", "Categorys of Games with Sucess"),
)

plt.figure(figsize=(12, 12))

for position, (subset, title, xlabel) in enumerate(success_panels, start=1):
    plt.subplot(3, 1, position)
    # Number of successful projects per sub-category.
    ax = sns.countplot(x='category', data=subset)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
    ax.set_title(title, fontsize=15)
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel("Counts", fontsize=12)

plt.subplots_adjust(wspace=0.3, hspace=0.9, top=0.9)

plt.show()

## Main Category

In [None]:
# Failed projects of three main categories.
failed = df['state'] == 'failed'
failed_film = df[(df['main_category'] == 'Film & Video') & failed]
failed_publishing = df[(df['main_category'] == 'Publishing') & failed]
failed_music = df[(df['main_category'] == 'Music') & failed]

# One stacked panel per main category: (data, title).
failed_panels = (
    (failed_film, "Film & Video Most Fail Category's "),
    (failed_publishing, "Publishing Most Fail Category's"),
    (failed_music, "Music Most Fail Category's"),
)

plt.figure(figsize=(12, 12))

for position, (subset, title) in enumerate(failed_panels, start=1):
    plt.subplot(3, 1, position)
    # Number of failed projects per sub-category.
    ax = sns.countplot(x='category', data=subset)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
    ax.set_title(title, fontsize=15)
    ax.set_xlabel("", fontsize=12)
    ax.set_ylabel("Counts", fontsize=12)

plt.subplots_adjust(wspace=0.5, hspace=0.9, top=0.9)
plt.show()

En ce qui concerne la musique, les meilleurs succès sont dans la catégorie Indie.
Les échecs concernent surtout le Rock et le Hip Hop.
Par contre, les documentaires occupent une place non négligeable dans le classement....


##  Le temps, une autre donnée : évaluons-le en fonction des autres critères ...

In [None]:
# Parse the launch timestamp and derive month, year and hour-of-day features.
# "Y" is the yearly period alias (the older "A" spelling is deprecated).
df['launched'] = pd.to_datetime(df['launched'])
df['laun_month_year'] = df['launched'].dt.to_period("M")
df['laun_year'] = df['launched'].dt.to_period("Y")
df['laun_hour'] = df['launched'].dt.hour

# Same for the deadline. `dead_year` is derived from 'deadline' (it was
# previously computed from 'launched', which made it a copy of `laun_year`).
df['deadline'] = pd.to_datetime(df['deadline'])
df['dead_month_year'] = df['deadline'].dt.to_period("M")
df['dead_year'] = df['deadline'].dt.to_period("Y")

In [None]:

# Add a new column: campaign length in calendar months (deadline month minus
# launch month). It is computed from the year/month components of the
# datetime columns: subtracting two Period columns yields offset objects on
# current pandas, which `astype(int)` cannot convert.
launch_date, deadline_date = df['launched'], df['deadline']
df['time_campaign'] = (
    (deadline_date.dt.year - launch_date.dt.year) * 12
    + (deadline_date.dt.month - launch_date.dt.month)
).astype(int)



In [None]:

# Project counts per campaign length (in months) and state, for campaigns
# shorter than 10 months.
campaigns_under_10 = df[df['time_campaign'] < 10]

plt.figure(figsize=(10, 6))
ax = sns.countplot(data=campaigns_under_10, x='time_campaign', hue='state')
ax.set_title("Distribuition of Campaign Time by State", fontsize=30)
ax.set_xlabel("Campaign Total Months", fontsize=20)
ax.set_ylabel("Count", fontsize=20)
plt.show()

# Contingency table restricted to campaigns shorter than 5 months.
months_under_5 = df.loc[df['time_campaign'] < 5, 'time_campaign']
print("Descriptions of Campaign Time x State")
print(pd.crosstab(months_under_5, df.state))

In [None]:
# Store the launch month ('YYYY-MM') and launch year ('YYYY') as strings.
# They are formatted from the 'launched' datetime column rather than from the
# columns being overwritten, so this cell can be re-run safely.
df['laun_month_year'] = df['launched'].dt.strftime('%Y-%m')
df['laun_year'] = df['launched'].dt.strftime('%Y')


In [None]:

# Keep the launch years strictly between '2008' and '2018' (string comparison
# on the 'YYYY' labels).
launched_in_range = df[(df.laun_year > '2008') & (df.laun_year < '2018')]

# Explicit chronological order for the x axis: without `order`, seaborn uses
# the order in which the years appear in the data.
year_order = sorted(launched_in_range['laun_year'].unique())

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

# Top panel: distribution of the pledged amount (log) per launch year.
sns.boxplot(x="laun_year", y='pledged_log', data=launched_in_range,
            order=year_order, ax=ax1)
ax1.set_title("Project Pledged by Year", fontsize=15)
ax1.set_xlabel("Years", fontsize=12)
ax1.set_ylabel("Pledged(log)", fontsize=12)

# Bottom panel: number of projects per launch year, split by state.
sns.countplot(x="laun_year", hue='state', data=launched_in_range,
              order=year_order, ax=ax2)
ax2.set_title("Projects count by Year", fontsize=18)
ax2.set_xlabel("State columns by Year", fontsize=15)
ax2.set_ylabel("Count", fontsize=15)

plt.show()

# The table covers every launch year, not only the filtered range.
print("Descriptive status count by year")
print(pd.crosstab(df.laun_year, df.state))


## Analysons les distributions de la différence entre 'Pledged' et 'Goal' pour les projets 'successful' et 'failed'


In [None]:
# Distribution of diff_pledged_goal (values below 200 only) for failed (red)
# and successful (green) projects.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases --
# consider histplot/kdeplot once the minimum seaborn version allows it.
below_200 = df['diff_pledged_goal'] < 200
failed_ratio = df.loc[below_200 & (df['state'] == 'failed'), 'diff_pledged_goal']
successful_ratio = df.loc[below_200 & (df['state'] == 'successful'), 'diff_pledged_goal']

plt.figure(figsize=(12, 6))
sns.distplot(failed_ratio, color='r')
sns.distplot(successful_ratio, color='g')
plt.show()


In [None]:
plt.figure(figsize=(18, 15))
plt.subplots_adjust(hspace=0.35, top=0.8)

# Top panel: number of projects per launch month, from 2010-01 onwards.
plt.subplot(2, 1, 1)
launched_since_2010 = df[df['laun_month_year'] >= '2010-01']
g1 = sns.countplot(data=launched_since_2010, x="laun_month_year")
g1.set_xticklabels(g1.get_xticklabels(), rotation=90)
g1.set_title("Value Distribution by Date Distribution", fontsize=30)
g1.set_xlabel("Date Distribution", fontsize=20)
g1.set_ylabel("Count", fontsize=20)

# Bottom panel: diff_pledged_goal (values below 150) per launch year and state.
plt.subplot(2, 1, 2)
ratio_below_150 = df[df['diff_pledged_goal'] < 150]
g2 = sns.boxplot(data=ratio_below_150, x="laun_year", y="diff_pledged_goal",
                 hue="state")
g2.set_xticklabels(g2.get_xticklabels(), rotation=90)
g2.set_title("Value Distribution by Date Distribution", fontsize=20)
g2.set_xlabel("Date Distribution", fontsize=20)
g2.set_ylabel("Goal x Pledged (%)", fontsize=20)
plt.show()

In [None]:
# Campaigns shorter than 10 months feed the first two panels.
short_campaigns = df[df['time_campaign'] < 10]

plt.figure(figsize=(14, 10))
plt.subplots_adjust(hspace=0.50, top=0.8)

# Panel 1: goal (log) per state, split by campaign length.
plt.subplot(311)
g = sns.boxplot(data=short_campaigns, x='state', y='goal_log',
                hue='time_campaign')
g.set_title("State Goal's by Campaign Time", fontsize=24)
g.set_xlabel("", fontsize=20)
g.set_ylabel("Goal(log)", fontsize=20)

# Panel 2: pledged (log) per state, sharing the x axis of panel 1.
plt.subplot(312, sharex=g)
g1 = sns.boxplot(data=short_campaigns, x='state', y='pledged_log',
                 hue='time_campaign')
g1.set_title("State Pledged's by Campaign Time", fontsize=24)
g1.set_xlabel("", fontsize=20)
g1.set_ylabel("Pledged(log)", fontsize=20)

# Panel 3: diff_pledged_goal per state, on the full dataset.
plt.subplot(313)
g2 = sns.boxplot(data=df, x='state', y='diff_pledged_goal')
g2.set_title("State % of Goal reached by Campaign Time", fontsize=30)
g2.set_xlabel("State", fontsize=20)
g2.set_ylabel("Percentual Goal", fontsize=20)
plt.show()

In [None]:

# Log-transform the number of backers. Adding 1 keeps projects with zero
# backers finite, since log(0) is -inf.
df['backers_log'] = np.log(1 + df['backers'])

# Distribution of the log-transformed backer counts.
plt.figure(figsize=(8, 6))
sns.distplot(df['backers_log'])
plt.show()



In [None]:
plt.figure(figsize=(12, 8))

# Top panel: backers (log) per project state.
plt.subplot(2, 1, 1)
g = sns.violinplot(data=df, x='state', y='backers_log')
g.set_title("Backers by STATE", fontsize=18)

# Bottom panel: backers (log) per main category, with rotated tick labels.
plt.subplot(2, 1, 2)
g = sns.violinplot(data=df, x='main_category', y='backers_log')
g.set_xticklabels(g.get_xticklabels(), rotation=45)

plt.show()

## backers

In [None]:
# Backers (log) per launch year, for years strictly between '2008' and '2018'.
year_in_range = (df.laun_year > '2008') & (df.laun_year < '2018')

plt.figure(figsize=(12, 8))
plt.subplot(2, 1, 1)
g = sns.boxplot(data=df[year_in_range], x='laun_year', y='backers_log')
g.set_title("Backers by YEAR", fontsize=18)

plt.show()


Analyse de la relation entre les financeurs et les buts atteints  'Backers' and '% of goal reached'

In [None]:
# Linear fit of backers (log) against diff_pledged_goal, one fit per state,
# restricted to rows where diff_pledged_goal is below 2000.
ratio_below_2000 = df[df['diff_pledged_goal'] < 2000]
sns.lmplot(data=ratio_below_2000, x='diff_pledged_goal', y='backers_log',
           hue='state', height=5, aspect=2)
plt.show()


# CONCLUSION

In [None]:
La plupart des projets qui sont financés sont ceux qui aboutissent le mieux

Les projets les plus financés sont les 'Comics' et les 'Games'

Les campagnes ne garantissent pas forcément le succès d'un projet : 'failed', 'successful' et 'canceled' sont quasiment au même niveau dans notre analyse
'state goals par campaign time'


