# Load Libraries/Data

In [82]:
## Standard Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
## Importing the OS and JSON Modules
import os,json

import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists

from scipy import stats

import warnings
warnings.filterwarnings('ignore')

## Load JSON File

In [11]:
# Read the mock API results. The encoding is pinned to UTF-8 (the standard
# encoding for JSON) so decoding does not depend on the platform's locale.
with open('Data/Mock_Crowdsourcing_API_Results.json', encoding='utf-8') as f:
    json_file = pd.read_json(f)

In [12]:
# Display the loaded frame to inspect its rows and columns.
json_file

Unnamed: 0,meta,data
crowd,Practice Lesson: Mock API Call,"[{'id': 658776, 'posted_time': '2014-01-17 21:..."
demographics,Practice Lesson: Mock API Call,"[{'id': 658776, 'country': 'El Salvador', 'reg..."
financials,Practice Lesson: Mock API Call,"[{'id': 658776, 'funded_amount': '$1000.0', 'c..."
use,Practice Lesson: Mock API Call,"[{'id': 658776, 'activity': 'Vehicle', 'sector..."


# **Extract 4 Sets of Data**

## "crowd" Data

In [13]:
# Expand the records stored in the "data" cell of the "crowd" row into a table.
crowd_records = json_file.at["crowd", "data"]
df_crowd = pd.DataFrame(crowd_records)
df_crowd

Unnamed: 0,id,posted_time,funded_time,lender_count
0,658776,2014-01-17 21:21:10+00:00,2014-02-05 17:57:55+00:00,33
1,1314847,2017-06-07 02:02:41+00:00,2017-06-21 17:10:38+00:00,9
2,863063,2015-03-27 20:08:04+00:00,2015-04-04 15:01:22+00:00,1
3,1184347,2016-11-14 07:32:12+00:00,2016-11-25 03:07:13+00:00,47
4,729745,2014-06-24 07:35:46+00:00,2014-07-10 16:12:43+00:00,12
...,...,...,...,...
9995,679499,2014-03-05 07:05:38+00:00,2014-03-13 01:01:41+00:00,11
9996,873525,2015-04-22 06:32:13+00:00,,6
9997,917686,2015-07-15 11:53:33+00:00,2015-08-14 11:45:40+00:00,44
9998,905789,2015-06-22 07:44:18+00:00,2015-07-14 00:20:45+00:00,11


## "demographics" Data

In [14]:
# Expand the records stored in the "data" cell of the "demographics" row into a table.
demo_records = json_file.at["demographics", "data"]
df_demo = pd.DataFrame(demo_records)
df_demo

Unnamed: 0,id,country,region,borrower_genders
0,658776,El Salvador,Ciudad El Triunfo,male
1,1314847,Philippines,"Bais, Negros Oriental",female
2,863063,Peru,Huarochiri,"female, female, female, female, female, female..."
3,1184347,Armenia,Vanadzor town,female
4,729745,Uganda,Masindi,female
...,...,...,...,...
9995,679499,Pakistan,Lahore,female
9996,873525,Kenya,Machakos,"male, male, female, female, male"
9997,917686,Senegal,,"female, female"
9998,905789,Philippines,"Binalbagan, Negros Occidental",female


## "financials" Data

In [15]:
# Expand the records stored in the "data" cell of the "financials" row into a table.
fin_records = json_file.at["financials", "data"]
df_fin = pd.DataFrame(fin_records)
df_fin

Unnamed: 0,id,funded_amount,currency,term_in_months
0,658776,$1000.0,USD,20.0
1,1314847,$225.0,PHP,13.0
2,863063,$1150.0,PEN,6.0
3,1184347,$1700.0,AMD,26.0
4,729745,$400.0,UGX,8.0
...,...,...,...,...
9995,679499,400.0,PKR,12.0
9996,873525,375.0,KES,14.0
9997,917686,1375.0,XOF,8.0
9998,905789,450.0,PHP,13.0


## "use" Data

In [16]:
# Expand the records stored in the "data" cell of the "use" row into a table.
use_records = json_file.at["use", "data"]
df_use = pd.DataFrame(use_records)
df_use

Unnamed: 0,id,activity,sector,use
0,658776,Vehicle,Personal Use,to purchase a motorcycle in order to travel fr...
1,1314847,Pigs,Agriculture,to buy feed and other supplies like vitamins t...
2,863063,Bookstore,Retail,"to buy notebooks, pencils, and pens."
3,1184347,Photography,Services,to pay for a new lens for providing photograph...
4,729745,Fuel/Firewood,Retail,to buy firewood to sell.
...,...,...,...,...
9995,679499,Fruits & Vegetables,Food,to help her husband buy onions for resale.
9996,873525,Farming,Agriculture,to buy fertilizer and pesticides to boost his ...
9997,917686,Fish Selling,Food,buy fish
9998,905789,General Store,Retail,to buy more groceries to sell.


# **Transform**

## Remove '$' From 'funded_amount' in Financials Data
- And change to numeric
***
- No need to normalize
- No need to clean or preprocess

In [17]:
# Strip the literal dollar sign. `regex=False` is explicit because '$' is a
# regex metacharacter (end-of-string anchor) and the default value of `regex`
# differs between pandas versions.
df_fin['funded_amount'] = df_fin['funded_amount'].str.replace('$', '', regex=False)
df_fin.head(2)

Unnamed: 0,id,funded_amount,currency,term_in_months
0,658776,1000.0,USD,20.0
1,1314847,225.0,PHP,13.0


In [18]:
# Check the column dtype after stripping the '$' prefix.
df_fin['funded_amount'].dtype

dtype('O')

In [19]:
# Cast the cleaned amount strings to floats so they can be used numerically.
amount_as_float = df_fin['funded_amount'].astype(float)
df_fin['funded_amount'] = amount_as_float
df_fin['funded_amount'].dtype

dtype('float64')

# **Load**

## Convert to MySQL Database

### Create Database 'Mock_Exam'

In [23]:
# Build the SQLAlchemy URL from environment variables so real credentials do
# not have to be stored in the notebook. The fallbacks reproduce the original
# local-development settings.
# NOTE: a password containing URL-special characters (e.g. '@', '/') must be
# percent-encoded before being placed in the URL.
db_user = os.environ.get("MOCK_EXAM_DB_USER", "root")
db_password = os.environ.get("MOCK_EXAM_DB_PASSWORD", "root")
db_host = os.environ.get("MOCK_EXAM_DB_HOST", "localhost")
db_name = os.environ.get("MOCK_EXAM_DB_NAME", "mock_exam")
connection_str = f"mysql+pymysql://{db_user}:{db_password}@{db_host}/{db_name}"

In [24]:
# Create the SQLAlchemy engine used by the to_sql / read_sql calls below.
engine = create_engine(connection_str)

In [26]:
# Create the database only when it is missing; otherwise just report it.
if database_exists(connection_str):
    print('The database already exists')
else:
    create_database(connection_str)

The database already exists


### Upload DataFrames as Tables

In [27]:
# index=False: the frame only has its default positional index, so do not
# persist it as an extra `index` column in the table.
df_crowd.to_sql('crowd', engine, if_exists='replace', index=False)

10000

In [28]:
# index=False: the frame only has its default positional index, so do not
# persist it as an extra `index` column in the table.
df_demo.to_sql('demographics', engine, if_exists='replace', index=False)

10000

In [29]:
# index=False: the frame only has its default positional index, so do not
# persist it as an extra `index` column in the table.
df_fin.to_sql('financials', engine, if_exists='replace', index=False)

10000

In [30]:
# index=False: the frame only has its default positional index, so do not
# persist it as an extra `index` column in the table.
df_use.to_sql('use', engine, if_exists='replace', index=False)

10000

### SQL Query "Show Tables;" for Database

In [31]:
# Ask MySQL for the tables in the connected database to confirm the uploads.
q = """

SHOW TABLES;

"""

pd.read_sql(q, engine)

Unnamed: 0,Tables_in_mock_exam
0,crowd
1,demographics
2,financials
3,use


# **Hypothesis Test**
*“Is there a significant difference in the funded amount between all-male groups and groups with at least one female?”*

## Create Two Groups "all_male" and "at_least_one_female"

### Merge Relevant DataFrames

In [32]:
# Join borrower demographics with loan financials on the shared loan id,
# keeping only ids present in both frames.
df_merge = df_demo.merge(df_fin, how='inner', on='id')
df_merge

Unnamed: 0,id,country,region,borrower_genders,funded_amount,currency,term_in_months
0,658776,El Salvador,Ciudad El Triunfo,male,1000.0,USD,20.0
1,1314847,Philippines,"Bais, Negros Oriental",female,225.0,PHP,13.0
2,863063,Peru,Huarochiri,"female, female, female, female, female, female...",1150.0,PEN,6.0
3,1184347,Armenia,Vanadzor town,female,1700.0,AMD,26.0
4,729745,Uganda,Masindi,female,400.0,UGX,8.0
...,...,...,...,...,...,...,...
9995,679499,Pakistan,Lahore,female,400.0,PKR,12.0
9996,873525,Kenya,Machakos,"male, male, female, female, male",375.0,KES,14.0
9997,917686,Senegal,,"female, female",1375.0,XOF,8.0
9998,905789,Philippines,"Binalbagan, Negros Occidental",female,450.0,PHP,13.0


### Create Groups

In [73]:
# Keep only the loans whose borrower gender is recorded.
gender_known = df_merge["borrower_genders"].notna()
df_merge = df_merge[gender_known]

In [74]:
# Display the merged frame after dropping rows with a missing borrower gender.
df_merge

Unnamed: 0,id,country,region,borrower_genders,funded_amount,currency,term_in_months
0,658776,El Salvador,Ciudad El Triunfo,male,1000.0,USD,20.0
1,1314847,Philippines,"Bais, Negros Oriental",female,225.0,PHP,13.0
2,863063,Peru,Huarochiri,"female, female, female, female, female, female...",1150.0,PEN,6.0
3,1184347,Armenia,Vanadzor town,female,1700.0,AMD,26.0
4,729745,Uganda,Masindi,female,400.0,UGX,8.0
...,...,...,...,...,...,...,...
9995,679499,Pakistan,Lahore,female,400.0,PKR,12.0
9996,873525,Kenya,Machakos,"male, male, female, female, male",375.0,KES,14.0
9997,917686,Senegal,,"female, female",1375.0,XOF,8.0
9998,905789,Philippines,"Binalbagan, Negros Occidental",female,450.0,PHP,13.0


In [77]:
# Build the membership mask once: True when 'female' appears anywhere in the
# borrower gender string, False otherwise.
has_female = df_merge['borrower_genders'].str.contains('female')

at_least_one_female = df_merge.loc[has_female, 'funded_amount']
all_male = df_merge.loc[~has_female, 'funded_amount']

In [78]:
# Funded amounts for loans whose borrower list contains 'female'.
at_least_one_female

1        225.0
2       1150.0
3       1700.0
4        400.0
5        350.0
         ...  
9995     400.0
9996     375.0
9997    1375.0
9998     450.0
9999     125.0
Name: funded_amount, Length: 7820, dtype: float64

In [79]:
# Funded amounts for loans whose borrower list does not contain 'female'.
all_male

0       1000.0
8        925.0
18       875.0
22       600.0
32       375.0
         ...  
9984    1000.0
9985     800.0
9991     125.0
9992     100.0
9993    3000.0
Name: funded_amount, Length: 2119, dtype: float64

In [80]:
# Mean funded amount per group: at-least-one-female first, then all-male.
for group in (at_least_one_female, all_male):
    print(group.mean())

771.0102301790281
802.2345445965078


## STEP 1: Stating the Hypothesis
- **Null Hypothesis**: there is no significant difference in funded amount when it is all males compared to when there is at least one female in the group


- **Alternative Hypothesis**: there is a significant difference in funded amount when it is all males compared to when there is at least one female in the group


- Alpha: 0.05

## STEP 2: Determine the Category/Type of Test Based on Data
- 2 Sample T-Test
    - Comparing numeric data
    - Two groups: all male and at least one female

## STEP 3: Does the Data Meet the Assumptions of the Selected Test

### Outliers

In [107]:
# all male

print(f'Number of rows: {len(all_male)}')

# Flag observations whose absolute z-score within the group exceeds 3.
z_score = stats.zscore(all_male)
outliers = np.abs(z_score) > 3

print(f'Number of outliers: {outliers.sum()}')

# NOTE: `all_male` is rebound to the filtered series, so re-running this cell
# applies the filter again to the already-filtered data.
all_male = all_male[~outliers]
print(f'Number of rows: {len(all_male)}')

Number of rows: 2119
Number of outliers: 26
Number of rows: 2093


In [109]:
# at least one female

print(f'Number of rows: {len(at_least_one_female)}')

# Flag observations whose absolute z-score within the group exceeds 3.
z_score = stats.zscore(at_least_one_female)
outliers = np.abs(z_score) > 3

print(f'Number of outliers: {outliers.sum()}')

# NOTE: `at_least_one_female` is rebound to the filtered series, so re-running
# this cell applies the filter again to the already-filtered data.
at_least_one_female = at_least_one_female[~outliers]
print(f'Number of rows: {len(at_least_one_female)}')

Number of rows: 7820
Number of outliers: 202
Number of rows: 7618


### Normality
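- **Note**: A formal normality check is optional here because each group has far more than 15 observations
- Both groups fail the normality test (p < 0.05), so the Normality assumption is not met
- Because the sample size of both groups is > 15, the t-test can still be used and we can continue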

In [111]:
# Normality test for the all-male group; a p-value below 0.05 is reported as
# significant.
norm_male = stats.normaltest(all_male)
male_p = norm_male.pvalue
print(f'{male_p:.10f}')
print(f'Significant: {male_p < 0.05}')

0.0000000000
Significant: True


In [112]:
# Normality test for the at-least-one-female group; a p-value below 0.05 is
# reported as significant.
norm_female = stats.normaltest(at_least_one_female)
female_p = norm_female.pvalue
print(f'{female_p:.10f}')
print(f'Significant: {female_p < 0.05}')

0.0000000000
Significant: True


### Equal Variance
- Fails to meet Equal Variance assumption
- Will include `equal_var = False` in T-test argument

In [113]:
# Levene's test across the two groups; a p-value below 0.05 is reported as
# significant.
e_var = stats.levene(all_male, at_least_one_female)
levene_p = e_var.pvalue
print(e_var)
print(f'Significant: {levene_p < 0.05}')

LeveneResult(statistic=5.919603200045773, pvalue=0.014991261165002913)
Significant: True


## STEP 4: Perform the Test & Interpret the Result
- Will **reject** null hypothesis that there is no significant difference in funded amount when it is all males compared to when there is at least one female in the group
    - Support alternative hypothesis


In [114]:
## Final t-test. equal_var=False is passed because the equal-variance check
## above came back significant.
result = stats.ttest_ind(
    all_male,
    at_least_one_female,
    equal_var=False,
)
result

Ttest_indResult(statistic=4.570140894626427, pvalue=5.046604720900298e-06)

In [115]:
## Report the p-value and whether it falls below the 0.05 alpha level
p_value = result.pvalue
print(f"p-value={p_value:.10f}")
print(f"Significant: {p_value <.05}")

p-value=0.0000050466
Significant: True


## Summary and Visualization
- Since we saw that there is a significant result in our T-test, we can support the Alternative Hypothesis
    - 