In [151]:
# Importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from datetime import datetime as dt, date

## BUSINESS PROBLEM-1

### Performing EDA and removing noise from the data.

In [385]:
# Reading the file.
loans_data= pd.read_csv(r"F:\AnalyticslabFIles\Projects\python\4. Basic Statistics - Hypothesis Testing\LoansData.csv")

In [359]:
# Checking for null values.

loans_data.isna().sum()

Amount.Requested                   1
Amount.Funded.By.Investors         1
Interest.Rate                      2
Loan.Length                        0
Loan.Purpose                       0
Debt.To.Income.Ratio               1
State                              0
Home.Ownership                     1
Monthly.Income                     1
FICO.Range                         2
Open.CREDIT.Lines                  3
Revolving.CREDIT.Balance           3
Inquiries.in.the.Last.6.Months     3
Employment.Length                 77
dtype: int64

In [360]:
# Removing the noise values and converting the Interest.Rate column to numeric
loans_data['Interest.Rate']= pd.to_numeric(loans_data['Interest.Rate'].str.replace('%', ""))

In [379]:
# Creating new column 'FICO_scores' from FICO.Range
loans_data['FICO_scores']= pd.to_numeric(loans_data['FICO.Range'].str.split('-', expand= True)[0])

In [380]:
# Filling the null values with mode of columns:

for i in ['Amount.Requested', 'Amount.Funded.By.Investors', 
          'Interest.Rate', 'Debt.To.Income.Ratio', 'Home.Ownership',
          'Monthly.Income', 'FICO.Range', 'Open.CREDIT.Lines', 'Revolving.CREDIT.Balance',
         'Inquiries.in.the.Last.6.Months', 'Employment.Length']:
    loans_data.fillna(loans_data[i].mode()[0], inplace= True)

In [381]:
loans_data.isna().sum()

Amount.Requested                  0
Amount.Funded.By.Investors        0
Interest.Rate                     0
Loan.Length                       0
Loan.Purpose                      0
Debt.To.Income.Ratio              0
State                             0
Home.Ownership                    0
Monthly.Income                    0
FICO.Range                        0
Open.CREDIT.Lines                 0
Revolving.CREDIT.Balance          0
Inquiries.in.the.Last.6.Months    0
Employment.Length                 0
FICO_scores                       0
dtype: int64

#### BUSINESS PROBLEM:
#### Using lending club loans data, the team would like to test below hypothesis on how different  factors effecing each other (Hint: You may leverage hypothesis testing using statistical tests)

##### a. Intrest rate is varied for different loan amounts (Less intrest charged for high loan amounts)

In [363]:
# Viewing the data
loans_data.head(2)

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,20000.0,20000.0,8.9,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year
1,19200.0,19200.0,12.12,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years


In [364]:
# Checking the correlation between the Interest rate and Amount funded by the investors.
loans_data.loc[:, ['Interest.Rate', 'Amount.Funded.By.Investors']].corr()
# The correlation between the two variables is weak which shows that Interest.Rate does not big significance on Amount.Funded.By.Investors

Unnamed: 0,Interest.Rate,Amount.Funded.By.Investors
Interest.Rate,1.0,0.015944
Amount.Funded.By.Investors,0.015944,1.0


##### b. Loan length is directly effecting intrest rate.

In [351]:
# Samples with different Loan.Length.
s1= loans_data.loc[loans_data['Loan.Length']== '36 months', 'Interest.Rate'] 
s2= loans_data.loc[loans_data['Loan.Length']== '60 months', 'Interest.Rate']

In [352]:
# H0: both are from the same population i,e loan length has no effect on Interest rate. 
# Ha: both are from the different population i,e loan length has effect on Interest rate.
# CI: 95%, p-value: 0.05

# Perform the test:
stats.ttest_ind(s1, s2)


Ttest_indResult(statistic=0.43584197622598564, pvalue=0.6629890455123574)

In [160]:
# Business conclusion:
print('Business conclusion: Since p value > 0.05 , we are fail to reject the H0 i,e both the means are different and thus loan length has no effect on the Interest rate')

Business conclusion: Since p value > 0.05 , we are fail to reject the H0 i,e both the means are different and thus loan length has no effect on the Interest rate


##### c. Inrest rate varies for different purpose of loans

In [366]:
# Samples with different Loan.Purpose

s1= loans_data.loc[loans_data['Loan.Purpose']== 'debt_consolidation', 'Interest.Rate']
s2= loans_data.loc[loans_data['Loan.Purpose']== 'credit_card', 'Interest.Rate']
s3= loans_data.loc[loans_data['Loan.Purpose']== 'other', 'Interest.Rate']
s4= loans_data.loc[loans_data['Loan.Purpose']== 'moving', 'Interest.Rate']
s5= loans_data.loc[loans_data['Loan.Purpose']== 'car', 'Interest.Rate']
s6= loans_data.loc[loans_data['Loan.Purpose']== 'vacation', 'Interest.Rate']
s7= loans_data.loc[loans_data['Loan.Purpose']== 'home_improvement', 'Interest.Rate']
s8= loans_data.loc[loans_data['Loan.Purpose']== 'house', 'Interest.Rate']
s9= loans_data.loc[loans_data['Loan.Purpose']== 'major_purchase', 'Interest.Rate']
s10= loans_data.loc[loans_data['Loan.Purpose']== 'educational', 'Interest.Rate']
s11= loans_data.loc[loans_data['Loan.Purpose']=='medical', 'Interest.Rate']
s12= loans_data.loc[loans_data['Loan.Purpose']== 'wedding', 'Interest.Rate']
s13= loans_data.loc[loans_data['Loan.Purpose']== 'small_business', 'Interest.Rate']
s14= loans_data.loc[loans_data['Loan.Purpose']=='renewable_energy', 'Interest.Rate']

In [367]:
# Mean of samples

print('mean of s1:', s1.mean(), 
      '| mean of s2:', s2.mean(), 
      '| mean of s3:', s3.mean(),
      '| mean of s4:', s4.mean(), 
      '| mean of s5:', s5.mean(), 
      '| mean of s6:', s6.mean(), 
      '| mean of s7:', s7.mean(),
      '| mean of s8:', s8.mean(),
      '| mean of s9:', s9.mean(),
      '| mean of s10:', s10.mean(),
      '| mean of s11:', s11.mean(),
      '| mean of s12:', s12.mean(), 
      '| mean of s13:', s13.mean(),
      '| mean of s14:', s14.mean())

mean of s1: 13.586916602907444 | mean of s2: 35.54378378378376 | mean of s3: 13.159552238805967 | mean of s4: 13.621034482758622 | mean of s5: 11.113000000000001 | mean of s6: 11.966190476190476 | mean of s7: 11.594276315789479 | mean of s8: 13.448000000000002 | mean of s9: 10.797821782178215 | mean of s10: 11.007333333333332 | mean of s11: 11.649666666666668 | mean of s12: 268.14846153846156 | mean of s13: 12.839195402298845 | mean of s14: 9.8775


In [368]:
# H0: All the samples are from the same population i,e interest rate does not varies for different purpose of loans
# Ha: All the samples are from the different population i,e interest rate does vary for different purpose of loans
# CI: 95%, p-value: 0.05

# Perform the test:
stats.f_oneway(s1, s2, s3, s3, s4, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14)

F_onewayResult(statistic=2.406413964865984, pvalue=0.0018009747325657183)

In [165]:
# Business conclusion:
print('Business conclusion: Since p-value < 0.05, we are rejecting the null hypothesis ( H0 ) i,e all samples are from different population which means interest rate does vary for different purpose of loans')

Business conclusion: Since p-value < 0.05, we are rejecting the H0 ( null hypothesis ), i,e all samples are from different population which means interest rate does vary for different purpose of loans


##### d. There is relationship between FICO scores and Home Ownership. It means that, People with owning home will have high FICO scores.

In [383]:
s1= loans_data.loc[loans_data['Home.Ownership']== 'MORTGAGE', 'FICO_scores' ]
s2= loans_data.loc[loans_data['Home.Ownership']== 'RENT', 'FICO_scores' ]
s3= loans_data.loc[loans_data['Home.Ownership']== 'OTHER', 'FICO_scores' ]
s4= loans_data.loc[loans_data['Home.Ownership']== 'NONE', 'FICO_scores' ]
s5= loans_data.loc[loans_data['Home.Ownership']== 'OWN', 'FICO_scores' ]

In [384]:
# H0: All the samples have the same mean i,e they belong to same population which means that FICO score does not have any effect on Home.Ownership 
# Ha: All the samples have different mean i,e they belong to different population which means that FICO score does have an impact on Home.Ownership
# CI: 95%, p-value: 0.05

# Performing the test:
stats.f_oneway(s1, s2, s3, s4, s5)

F_onewayResult(statistic=0.3681610332553651, pvalue=0.8314484927825438)

In [220]:
# Business conclusion:
print('Business conclusion: Since p-value > 0.05, we fail to reject the null hypothesis which means FICO scores does not have any impact on Home.Ownership')

Business conclusion: Since p-value > 0.05, we fail to reject the null hypothesis which means FICO scores does not have any impact on Home.Ownership


## BUSINESS PROBLEM - 2

In [221]:
# Reading the file.
price_quotes= pd.read_csv(r"F:\AnalyticslabFIles\Projects\python\4. Basic Statistics - Hypothesis Testing\Price_Quotes.csv")

### Performing EDA and removing noise from the data.

In [386]:
# Checking for null values.
price_quotes.isna().sum()

Order_Number    0
Barry_Price     0
Mary_Price      0
dtype: int64

In [387]:
# Viewing the data.
price_quotes.head(2)

Unnamed: 0,Order_Number,Barry_Price,Mary_Price
0,1,126,114
1,2,110,118


#### BUSINESS PROBLEM: We would like to assess if there is any difference in the average price quotes provided by Mary and Barry.

In [388]:
# H0: Prices given by Barry and Mary are equal
# Ha: Prices given by Barry and Mary are not equal
# CI: 95%, p-value: 0.05

# Perform the test:
stats.ttest_ind(a= price_quotes.Barry_Price, b= price_quotes.Mary_Price)

Ttest_indResult(statistic=1.4147436739281787, pvalue=0.17114226132118285)

In [224]:
# Business conclusion:
print('Business conclusion: Since the p-value > 0.05, we failed to reject the null hypothesis which means prices given by Barry & Mary are almost equal.')

Business conclusion: Since the p-value > 0.05, we failed to reject the null hypothesis which means prices given by Barry & Mary are almost equal.


## BUSINESS PROBLEM-3:

In [225]:
# Reading the data.
treamet_facility= pd.read_csv(r"F:\AnalyticslabFIles\Projects\python\4. Basic Statistics - Hypothesis Testing\Treatment_Facility.csv")

### Performing EDA and removing noise from the data.

In [390]:
# Cheking for null values.
treamet_facility.isna().sum()

Month                0
Reengineer           0
Employee_Turnover    0
VAR4                 0
VAR5                 0
dtype: int64

In [391]:
# Viewing the data.
treamet_facility.head(2)

Unnamed: 0,Month,Reengineer,Employee_Turnover,VAR4,VAR5
0,1,Prior,0.0,24.390244,42.682927
1,2,Prior,6.0606,19.354839,25.806452


#### BUSINESS PROBLEM: Determine what effect, if any, the reengineering effort had on the incidence behavioral problems and staff turnover. i.e To determine if the reengineering effort changed the critical incidence rate. Isthere evidence that the critical incidence rate improved?

In [392]:
# Samples 
s1= treamet_facility.loc[treamet_facility.Reengineer == 'Prior', 'VAR5']
s2= treamet_facility.loc[treamet_facility.Reengineer == 'Post', 'VAR5']

In [393]:
# H0: mean of s1 and s2 is same i,e Reenginering has impacted the Critical Incidence rate.
# Ha: mean of s1 and s2 is not equal i,e Reengneering has not impacted the Critical Incidence rate.
# CI: 95%, p-value: 0.05
# Perform the test:

stats.ttest_ind(s1, s2)

Ttest_indResult(statistic=1.6279144253528646, pvalue=0.12091989189884149)

In [394]:
# Business conclusion:
print('Business conclusion: Since the p-value > 0.05, which means that Reengineering has impacted the Critical Incidence rate.')

Business conclusion: Since the p-value > 0.05, which means that Reengineering has impacted the Critical Incidence rate.


## BUSINESS PROBLEM-4

In [232]:
# Reading the file.
priority_assesment= pd.read_csv(r"F:\AnalyticslabFIles\Projects\python\4. Basic Statistics - Hypothesis Testing\Priority_Assessment.csv")

### Performing EDA and removing noise from the data.

In [396]:
priority_assesment.isna().sum()

Days        0
Priority    0
dtype: int64

In [395]:
# Viewing the data.
priority_assesment.head(2)

Unnamed: 0,Days,Priority
0,3.3,High
1,7.9,Medium


#### BUSINESS PROBLEM: We will focus on the prioritization system. If the system is working, then high priority jobs, on average, should be completed more quickly than medium priority jobs, and medium priority jobs should be completed more quickly than low priority jobs. Use the data provided to determine whether this is, in fact, occurring.

In [398]:
# Samples.
s1= priority_assesment.loc[priority_assesment.Priority == 'High', 'Days']
s2= priority_assesment.loc[priority_assesment.Priority == 'Medium', 'Days']
s3= priority_assesment.loc[priority_assesment.Priority == 'Low', 'Days']

In [399]:
# H0: means of all the samples will belong to the same population i,e prioritization of work is  not working.
# Ha: Means of all the samples will belong to the different population which show that prioritizing work is working.
# CI: 95%, p-value: 0.05
# Perform the test:

stats.f_oneway(s1, s2, s3)

F_onewayResult(statistic=1.812311010076072, pvalue=0.16411459461716182)

In [236]:
# Business conclusion:
print('Business conclusion: Since p-value > 0.05, we fail to reject the null hypothesis which means that prioritization of is not working in properly.')

Business conclusion: Since p-value > 0.05, we fail to reject the null hypothesis which means that prioritization of is not working in properly.


## BUSINESS PROBLEM-5

In [401]:
# Reading the file.
films= pd.read_csv(r"F:\AnalyticslabFIles\Projects\python\4. Basic Statistics - Hypothesis Testing\Films.csv")

### Performing EDA and removing noise from the data.

In [402]:
films.isna().sum()

_rowstate_         0
Movie              0
Gender             0
Marital_Status     2
Sinage             2
Parking            2
Clean              3
Overall            2
Age                2
Income            16
Hear_About         7
dtype: int64

In [403]:
for i in films.columns:
    films.fillna(films[i].mode()[0], inplace= True)

In [289]:
films.isna().sum()

_rowstate_        0
Movie             0
Gender            0
Marital_Status    0
Sinage            0
Parking           0
Clean             0
Overall           0
Age               0
Income            0
Hear_About        0
dtype: int64

In [322]:
# Removing noise from the data.
films.Gender = np.where(films.Gender == '1', 'Male', np.where(films.Gender == '2', 'Female', np.where(films.Gender == 'Male', "Male", 'Female')))
films.Marital_Status = np.where(films.Marital_Status == '1', 'Married', np.where(films.Marital_Status == '2', 'Single', np.where(films.Marital_Status == 'Married', "Married", 'Single')))

In [260]:
# Viewing the data.
films.head(2)

Unnamed: 0,_rowstate_,Movie,Gender,Marital_Status,Sinage,Parking,Clean,Overall,Age,Income,Hear_About
0,0,Ferris Buellers Day Off,Female,Married,2.0,2.0,2.0,2.0,3.0,1.0,5
1,0,Ferris Buellers Day Off,Female,Single,1.0,1.0,1.0,1.0,2.0,1.0,5


#### BUSINESS PROBLEM: Use the survey resultsto addressthe following questions
####  What is the overall level of customer satisfaction?

In [257]:
sample= films.Overall
sample_mean= sample.mean()
sample_std= sample.std()
SE= sample_std/(np.sqrt(sample.count()))
# Population level
# at 95% CI
print('sample mean=', sample_mean, '| standard deviation=', sample_std, '| SE=', SE, '| mean - 1.960 * SE =', sample_mean - (1.960 * SE), '| mean + 1.960 * SE =', sample_mean + (1.960 * SE)) 

sample mean= 1.6189024390243902 | standard deviation= 0.6892863057161788 | SE= 0.03805946932136542 | mean - 1.960 * SE = 1.5443058791545141 | mean + 1.960 * SE = 1.6934989988942664


In [259]:
# Business conclusion:
print('Business conclusion: We are 95% that people have thier movie satishfaction between 1.54 and 1.69.')

Business conclusion: We are 95% that people have thier movie satishfaction between 1.54 and 1.69.


####  What factors are linked to satisfaction?

In [265]:
# checking the relationship between Gender and Overall

# Samples
s1= films.loc[films.Gender == 'Female', 'Overall']
s2= films.loc[films.Gender== 'Male', 'Overall']

# H0: mean of s1 = mean of s2 i,e Gender has no effect on Overall
# Ha: mean of s2 <> mean of s2 i,e Gender has effect on Overall
# CI: 95%, p-value: 0.05

# Perform the test:
print( 'Performing independent sample ttest:', stats.ttest_ind(s1, s2))

# Business conclusion: 
print('Business conclusion: Since p-value < 0.05, we reject the H0 which mena Gender has effect on Overall film experiance')

Performing independent sample ttest: Ttest_indResult(statistic=-2.5888027174359816, pvalue=0.01471174342681416)
Business conclusion: Since p-value < 0.05, we reject the H0 which mena Gender has effect on Overall film experiance


In [302]:
# checking the relationship between Age and Overall

# Samples
a1= films.loc[films.Age == 1., 'Overall']
a2= films.loc[films.Age == 2., 'Overall']
a3= films.loc[films.Age == 3., 'Overall']
a4= films.loc[films.Age == 4., 'Overall']
a0= films.loc[films.Age == 0., 'Overall']

# H0: mean of all samples are equal i,e Age has no effect on Overall
# Ha: mean of all samples are not equal i,e Age has effect on Overall
# CI: 95%, p-value: 0.05

# Perform the test:
print( 'Performing ANOVA or f-test:', stats.f_oneway(a1, a2, a3, a4, a0))

# Business conclusion: 
print('Business conclusion: Since p-value > 0.05, we fail to reject the H0 which means Age has no effect on Overall film experiance')

Performing ANOVA or f-test: F_onewayResult(statistic=0.9787002217923516, pvalue=0.4192434652806395)
Business conclusion: Since p-value > 0.05, we fail to reject the H0 which means Age has no effect on Overall film experiance


In [301]:
# checking the relationship between Income and Overall

# Samples
a1= films.loc[films.Income == 1., 'Overall']
a2= films.loc[films.Income == 2., 'Overall']
a3= films.loc[films.Income == 3., 'Overall']
a0= films.loc[films.Income == 0., 'Overall']

# H0: mean of all samples are equal i,e Income has no effect on Overall
# Ha: mean of all samples are not equal i,e Income has effect on Overall
# CI: 95%, p-value: 0.05

# Perform the test:
print( 'Performing ANOVA or f-test:', stats.f_oneway(a1, a2, a3, a0))

# Business conclusion: 
print('Business conclusion: Since p-value > 0.05, we fail to reject the H0 which means Income has no effect on Overall film experiance')


Performing ANOVA or f-test: F_onewayResult(statistic=0.045733037036391096, pvalue=0.9870019828888965)
Business conclusion: Since p-value > 0.05, we fail to reject the H0 which means Income has no effect on Overall film experiance


In [317]:
# checking the relationship between Hear_about and Overall

# Samples
a1= films.loc[films.Hear_About == '1', 'Overall']
a2= films.loc[films.Hear_About == '2', 'Overall']
a3= films.loc[films.Hear_About == '3', 'Overall']
a4= films.loc[films.Hear_About == '4', 'Overall']
a5= films.loc[films.Hear_About == '5', 'Overall']
a6= films.loc[films.Hear_About == 0, 'Overall']
a7= films.loc[films.Hear_About == '4,5', 'Overall']
a8= films.loc[films.Hear_About == '2,5', 'Overall']
a9= films.loc[films.Hear_About == '1,5', 'Overall']
a10= films.loc[films.Hear_About == '3,4', 'Overall']
a11= films.loc[films.Hear_About == '5,4', 'Overall']
a12= films.loc[films.Hear_About == '3,5', 'Overall']

# H0: mean of all samples are equal i,e Hear_about has no effect on Overall
# Ha: mean of all samples are not equal i,e Hear_about has effect on Overall
# CI: 95%, p-value: 0.05

# Perform the test:
print( 'Performing ANOVA or f-test:', stats.f_oneway(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12))

# Business conclusion: 
print('Business conclusion: Since p-value > 0.05, we fail to reject the H0 which means Hear_about has no effect on Overall film experiance')


Performing ANOVA or f-test: F_onewayResult(statistic=1.4442669994680088, pvalue=0.15182523046361193)
Business conclusion: Since p-value > 0.05, we fail to reject the H0 which means Hear_about has no effect on Overall film experiance


####  What is the demographic profile of Film on the Rocks patrons?

In [324]:
# Percentage of males and females.

(films.pivot_table(values= 'Movie', index= 'Gender', aggfunc= 'count') * 100)/films.Gender.count()

Unnamed: 0_level_0,Movie
Gender,Unnamed: 1_level_1
Female,64.545455
Male,35.454545


In [328]:
# Percentage of Married and Singles.
(films.pivot_table(values= 'Movie', index= 'Marital_Status', aggfunc= 'count') * 100)/films.Gender.count()

Unnamed: 0_level_0,Movie
Marital_Status,Unnamed: 1_level_1
Married,30.30303
Single,69.69697


In [329]:
# Percentage of different age groups.
(films.pivot_table(values= 'Movie', index= 'Age', aggfunc= 'count') * 100)/films.Gender.count()

Unnamed: 0_level_0,Movie
Age,Unnamed: 1_level_1
0.0,0.606061
1.0,7.878788
2.0,53.030303
3.0,35.454545
4.0,3.030303


In [330]:
# Percentage of different Income groups.
(films.pivot_table(values= 'Movie', index= 'Income', aggfunc= 'count') * 100)/films.Gender.count()

Unnamed: 0_level_0,Movie
Income,Unnamed: 1_level_1
0.0,4.848485
1.0,43.030303
2.0,24.848485
3.0,27.272727


In [332]:
# Percentage of different Hear_abouts.
(films.pivot_table(values= 'Movie', index= 'Hear_About', aggfunc= 'count') * 100)/films.Gender.count()

Unnamed: 0_level_0,Movie
Hear_About,Unnamed: 1_level_1
0,2.121212
1,6.666667
15,0.30303
2,3.636364
25,0.606061
3,4.242424
34,0.606061
35,0.30303
4,12.424242
45,0.30303


####  In what media outlet(s) should the film series be advertised?

In [333]:
films.Hear_About.value_counts()

5      226
4       41
1       22
3       14
2       12
0        7
2,5      2
3,4      2
4,5      1
5,4      1
1,5      1
3,5      1
Name: Hear_About, dtype: int64

In [334]:
print('From the above data we can see that film should be advertised in Word of mouth or through websites.')

The film should be advertised in Word of mouth or through websites
