#### Install all requirements for the project

In [2]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn



#### Importing Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Relative project directories. Each ends with a slash so a file name can be
# appended with plain string concatenation (e.g. DATA_PATH + "enroll.csv").
DOCS_PATH = "docs/"
DATA_PATH = "data/"

# Ayiti Analytics Data Processing Bootcamp
  Ayiti Analytics Data wants to expand its training centers throughout all the communes of the country. Your role as a data analyst is to help them realize this dream.

The objective is to identify the three communes of the country that are the best candidates for opening new training centers.

Knowing that each cohort must have 30 students
 
* How many applications, on average, must be received for each cohort in order to select 25% women?

* What are the most effective communication channels (Alumni, Facebook, WhatsApp, Friend ...) that make a student most likely to be selected?
 
* What is the average number of university students who should participate in this program
* What will be the average number of applications per week that we could have
* How many weeks should we extend the application process to select 60 students per commune?
* If we were to run the whole bootcamp online, which communes would be the best, how many applications would we need to select 30 students, and what percentage of students would have a laptop, an internet connection, or both at the same time?
* What are the most effective communication channels (Alumni, Facebook, WhatsApp, Friend ...) that make a woman most likely to be selected?

### NB 
Use the same framework as the BA project to complete this project.

>## Commune dataset

#### Importing Data

In [60]:
# Read the commune reference table from the Excel workbook and preview
# its first five rows.
commune_df = pd.read_excel(DATA_PATH + "commune.xlsx")
commune_df.head(n=5)

Unnamed: 0,Commune_en,Commune_FR,Commune_Id,Departement,ADM1_PCODE
0,Abricots,Abricots,HT0812,Grande'Anse,HT08
1,Acul du Nord,Acul du Nord,HT0321,North,HT03
2,Anse-a-Foleur,Anse-à-Foleur,HT0922,North-West,HT09
3,Anse-a-Pitre,Anse-à-Pître,HT0234,South-East,HT02
4,Anse-a-Veau,Anse-à-Veau,HT1021,Nippes,HT10


#### Data overview

In [58]:
# Report the dimensions of the commune table (rows first, then columns).
row = commune_df.shape[0]
col = commune_df.shape[1]
print("This datasset have", row, "rows", "and", col, "columns")

This datasset have 140 rows and 5 columns


In [10]:
# Show the dtype pandas inferred for each column of the commune table.
commune_df.dtypes

Commune_en     object
Commune_FR     object
Commune_Id     object
Departement    object
ADM1_PCODE     object
dtype: object

In [11]:
# Print a structural summary of the commune table: index range, per-column
# non-null counts, dtypes and memory usage.
commune_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Commune_en   140 non-null    object
 1   Commune_FR   140 non-null    object
 2   Commune_Id   140 non-null    object
 3   Departement  140 non-null    object
 4   ADM1_PCODE   140 non-null    object
dtypes: object(5)
memory usage: 5.6+ KB


In [12]:
# Summary statistics of the commune table. Every column is of object dtype,
# so describe() reports count / unique / top / freq instead of numeric moments.
commune_df.describe()

Unnamed: 0,Commune_en,Commune_FR,Commune_Id,Departement,ADM1_PCODE
count,140,140,140,140,140
unique,140,140,140,10,10
top,La Chapelle,Torbeck,HT0311,West,HT01
freq,1,1,1,20,20


#### Missing Values

In [136]:
# Count the missing (NaN) values in each column of the commune table.
commune_df.isna().sum()

Commune_en     0
Commune_Id     0
Departement    0
dtype: int64

In [134]:
# Count fully duplicated rows. DataFrame.duplicated() compares entire rows
# (all columns together), not the values inside individual columns.
commune_df.duplicated().sum()

0

**Conclusion:** The checks above show that the Commune dataset contains no missing values and no duplicated rows, so nothing needs to be removed or filled.

#### Clean the data

In [145]:
# Drop the columns that are no longer needed for this project
# ("Commune_FR" and "ADM1_PCODE"). The statements are currently disabled.
# NOTE(review): the saved isna() output above lists only three columns, so this
# drop appears to have been executed at some point before being commented out —
# confirm whether later cells expect it, and if so re-enable it without
# inplace=True so the cell stays idempotent on re-run.
# commune_df.drop(columns=["Commune_FR" , "ADM1_PCODE"], axis='columns', inplace=True)
# commune_df.head(10)

>## Enroll Dataset

#### Import Dataset

In [82]:
# Load the enrollment records and preview the first five rows.
# NOTE(review): the preview contains an "Unnamed: 0" column that looks like an
# index saved along with the CSV — confirm whether it should be dropped or
# read with index_col=0.
enroll_df = pd.read_csv(DATA_PATH + "enroll.csv")
enroll_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,created_at,expiry_date,user_id,course_name,course_id,percentage_completed,completed_at,expired,is_free_trial,completed,started_at,activated_at,updated_at,quest_id
0,0,2020-12-12T06:41:29.844Z,,29816173,Entwodiksyon Nan Syans Done,1047613,0.0,,False,True,False,,,2020-12-12T06:41:29.844Z,
1,1,2020-12-13T14:41:17.720Z,,35229766,Entwodiksyon Nan Syans Done,1047613,0.0,,False,True,False,,,2020-12-13T14:41:17.720Z,
2,2,2020-12-15T00:53:06.499Z,,31987560,Entwodiksyon Nan Syans Done,1047613,0.0,,False,True,False,,,2020-12-15T00:53:06.499Z,
3,3,2020-12-18T18:38:55.683Z,,35616451,Entwodiksyon Nan Syans Done,1047613,0.0,,False,True,False,,,2020-12-18T18:38:55.683Z,
4,4,2020-12-21T14:45:06.528Z,,35390649,Entwodiksyon Nan Syans Done,1047613,0.0,,False,True,False,,,2020-12-21T14:45:06.528Z,


In [81]:
# Keep only the percentage_completed values that are at least 0.5, selecting
# the rows and the column in a single .loc call.
show_p = enroll_df.loc[enroll_df["percentage_completed"] >= 0.5, "percentage_completed"]
show_p

13     0.930233
16     0.697674
20     0.534884
22     0.953488
27     0.906977
37     0.833333
44     0.813953
45     0.619048
48     0.511628
50     0.860465
52     0.953488
58     0.523810
68     0.953488
73     0.534884
75     0.906977
76     0.883721
77     0.953488
82     0.744186
85     0.953488
88     0.674419
89     0.790698
90     0.906977
92     0.906977
99     0.813953
103    0.604651
108    0.604651
111    0.883721
124    0.883721
129    0.697674
133    0.767442
139    0.604651
Name: percentage_completed, dtype: float64

#### Data overview

In [147]:
# Print a structural summary of the enrollment table: per-column non-null
# counts and dtypes.
enroll_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148 entries, 0 to 147
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            148 non-null    int64  
 1   created_at            148 non-null    object 
 2   expiry_date           60 non-null     object 
 3   user_id               148 non-null    int64  
 4   course_name           148 non-null    object 
 5   course_id             148 non-null    int64  
 6   percentage_completed  148 non-null    float64
 7   completed_at          0 non-null      float64
 8   expired               148 non-null    bool   
 9   is_free_trial         148 non-null    bool   
 10  completed             148 non-null    bool   
 11  started_at            135 non-null    object 
 12  activated_at          77 non-null     object 
 13  updated_at            148 non-null    object 
 14  quest_id              102 non-null    object 
dtypes: bool(3), float64(2),

In [150]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# columns of the enrollment table.
enroll_df.describe()

Unnamed: 0.1,Unnamed: 0,user_id,course_id,percentage_completed,completed_at
count,148.0,148.0,148.0,148.0,0.0
mean,73.5,39824230.0,1047613.0,0.249925,
std,42.868014,1782956.0,0.0,0.313651,
min,0.0,29816170.0,1047613.0,0.0,
25%,36.75,39898020.0,1047613.0,0.0,
50%,73.5,40097490.0,1047613.0,0.02381,
75%,110.25,40624870.0,1047613.0,0.418605,
max,147.0,41592700.0,1047613.0,0.953488,


#### Missing Values

In [158]:
# Count the missing (NaN) values in each column of the enrollment table.
enroll_df.isna().sum()

Unnamed: 0                0
created_at                0
expiry_date              88
user_id                   0
course_name               0
course_id                 0
percentage_completed      0
completed_at            148
expired                   0
is_free_trial             0
completed                 0
started_at               13
activated_at             71
updated_at                0
quest_id                 46
dtype: int64

#### Clean the data

>## Industry Dataset

In [65]:
# Load the industry records and preview the first three rows.
industry_df = pd.read_csv(DATA_PATH + "industry.csv")
industry_df.head(n=3)

Unnamed: 0.1,Unnamed: 0,key,values,quest_id
0,0,item1,Finance,905ddcf2-ce95-11eb-9a92-7c67a234f601
1,1,item6,Technology (Software/ Internet),905e2608-ce95-11eb-809f-7c67a234f601
2,2,item5,Education,905e2608-ce95-11eb-809f-7c67a234f601


>## Ord Dataset

In [66]:
# Load the order records and preview the first three rows.
ord_df = pd.read_csv(DATA_PATH + "ord.csv")
ord_df.head(n=3)

Unnamed: 0.1,Unnamed: 0,created_at,user_id,product_name,product_id,amount_dollars,amount_cents,subscription,coupon_code,coupon_id,affiliate_referral_code,status,quest_id
0,0,2021-03-14T15:34:35.014Z,41587952,Entwodiksyon Nan Syans Done,1100636,20,2000,False,,,,Complete,906b3c86-ce95-11eb-b6a7-7c67a234f601
1,1,2021-03-14T04:20:23.483Z,41564742,Entwodiksyon Nan Syans Done,1100636,20,2000,False,,,,Complete,906636cf-ce95-11eb-8679-7c67a234f601
2,2,2021-03-12T19:29:22.832Z,41501853,Entwodiksyon Nan Syans Done,1100636,20,2000,False,,,,Complete,906968f5-ce95-11eb-9e1a-7c67a234f601


>## Quest dataset

In [67]:
# Load the questionnaire records and preview the first three rows.
quest_df = pd.read_csv(DATA_PATH + "quest.csv")
quest_df.head(n=3)

Unnamed: 0.1,Unnamed: 0,gender,dob,commune,created_at,modified_at,department,education_level,university,study_domain,current_employed,formal_sector_job,have_computer_home,internet_at_home,hear_AA_1,after_AA,quest_id
0,0,male,25/08/1998,ht0111,2021-02-14T23:03:52.768400Z,2021-02-14T23:03:52.768462Z,,unknown,unknown,[],unknown,unknown,unknown,unknown,unknown,unknown,905dc006-ce95-11eb-b2f9-7c67a234f601
1,1,female,29/09/1996,ht0111,2021-02-16T16:01:10.861844Z,2021-02-16T16:01:10.861899Z,ht01,Bachelors (bacc +4),other,['other'],No,unknown,Yes,Yes,Friend,Finding a job/internship,905ddcf2-ce95-11eb-9a92-7c67a234f601
2,2,male,17/05/1996,ht0111,2021-02-16T16:56:30.579504Z,2021-02-16T16:56:30.579563Z,ht01,Bachelors (bacc +4),Université d'Etat d'Haïti (UEH),[],No,unknown,Yes,Yes,Friend,Improving my data analysis skills,905e2608-ce95-11eb-809f-7c67a234f601


>## Study domain Dataset

In [68]:
# Load the study-domain records and preview the first three rows.
studyd_df = pd.read_csv(DATA_PATH + "study_domain.csv")
studyd_df.head(n=3)

Unnamed: 0.1,Unnamed: 0,key,values,quest_id
0,0,other,other,905ddcf2-ce95-11eb-9a92-7c67a234f601
1,1,item1,Computer Science,905e4cf5-ce95-11eb-96a9-7c67a234f601
2,2,item6,Management,905e4cf5-ce95-11eb-96a9-7c67a234f601


>## Technology Dataset

In [69]:
# Load the technology records and preview the first three rows.
tech_df = pd.read_csv(DATA_PATH + "technology.csv")
tech_df.head(n=3)

Unnamed: 0.1,Unnamed: 0,key,values,quest_id
0,0,item1,R,905ddcf2-ce95-11eb-9a92-7c67a234f601
1,1,item11,Excel,905ddcf2-ce95-11eb-9a92-7c67a234f601
2,2,item2,Python,905e2608-ce95-11eb-809f-7c67a234f601


>## Transaction dataset

In [70]:
# Load the transaction records and preview the first three rows.
transaction_df = pd.read_csv(DATA_PATH + "transaction.csv")
transaction_df.head(n=3)

Unnamed: 0.1,Unnamed: 0,created_at,modified_at,user_id,course_id,transaction_id
0,0,2021-03-08T05:08:50.832Z,2021-03-08T05:08:50.832Z,40976440,1047613,4641281952
1,1,2021-03-08T17:26:35.841Z,2021-03-08T17:26:35.841Z,41179271,1047613,4644270160
2,2,2021-03-08T18:52:09.147Z,2021-03-08T18:52:09.147Z,40973512,1047613,4645070622
