# import package

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import scipy.stats as stats

### UDF

In [2]:
def fn_descriptives( x ):
    
    n_tot = x.isnull().count()
    n_miss = x.isnull().sum()
    n_miss_perc = n_miss / n_tot
    
    p25 = x.quantile(0.25)
    p75 = x.quantile(0.75)
    
    iqr = p75 - p25
    
    lc_iqr = p25 - 1.5 * iqr
    uc_iqr = p75 + 1.5 * iqr
    
    return pd.Series( [ x.dtype, x.nunique(), n_tot, x.count(), n_miss, n_miss_perc, 
                       x.sum(), x.var(), x.std(), x.mean(), 
                       iqr, lc_iqr, uc_iqr, 
                       x.min(), x.quantile(0.01), x.quantile(0.05), x.quantile(0.10), 
                       x.quantile(0.25), x.quantile(0.50), x.quantile(0.75), 
                       x.quantile(0.90), x.quantile(0.95), x.quantile(0.99), x.max()],
                     
                    index = ['dtype', 'cardinality', 'n_tot', 'n', 'n_miss', 'n_miss_perc',
                             'sum', 'var', 'std', 'mean',
                             'iqr', 'lc_iqr', 'uc_iqr',
                             'min', 'p1', 'p5', 'p10', 
                             'p25', 'p50', 'p75', 
                             'p90', 'p95', 'p99', 'max'])

#                                       BUSINESS PROBLEM-1

In [3]:
loan = pd.read_csv("C:/Users/Jayanto Debnath/Downloads/Basic Statistics - Hypothesis Testing/LoansData.csv")
loan.head()

Unnamed: 0,Amount.Requested,Amount.Funded.By.Investors,Interest.Rate,Loan.Length,Loan.Purpose,Debt.To.Income.Ratio,State,Home.Ownership,Monthly.Income,FICO.Range,Open.CREDIT.Lines,Revolving.CREDIT.Balance,Inquiries.in.the.Last.6.Months,Employment.Length
0,20000.0,20000.0,8.90%,36 months,debt_consolidation,14.90%,SC,MORTGAGE,6541.67,735-739,14.0,14272.0,2.0,< 1 year
1,19200.0,19200.0,12.12%,36 months,debt_consolidation,28.36%,TX,MORTGAGE,4583.33,715-719,12.0,11140.0,1.0,2 years
2,35000.0,35000.0,21.98%,60 months,debt_consolidation,23.81%,CA,MORTGAGE,11500.0,690-694,14.0,21977.0,1.0,2 years
3,10000.0,9975.0,9.99%,36 months,debt_consolidation,14.30%,KS,MORTGAGE,3833.33,695-699,10.0,9346.0,0.0,5 years
4,12000.0,12000.0,11.71%,36 months,credit_card,18.78%,NJ,RENT,3195.0,695-699,11.0,14469.0,0.0,9 years


## Data prep

In [4]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Amount.Requested                2499 non-null   float64
 1   Amount.Funded.By.Investors      2499 non-null   float64
 2   Interest.Rate                   2498 non-null   object 
 3   Loan.Length                     2500 non-null   object 
 4   Loan.Purpose                    2500 non-null   object 
 5   Debt.To.Income.Ratio            2499 non-null   object 
 6   State                           2500 non-null   object 
 7   Home.Ownership                  2499 non-null   object 
 8   Monthly.Income                  2499 non-null   float64
 9   FICO.Range                      2498 non-null   object 
 10  Open.CREDIT.Lines               2497 non-null   float64
 11  Revolving.CREDIT.Balance        2497 non-null   float64
 12  Inquiries.in.the.Last.6.Months  24

In [5]:
loan.columns

Index(['Amount.Requested', 'Amount.Funded.By.Investors', 'Interest.Rate',
       'Loan.Length', 'Loan.Purpose', 'Debt.To.Income.Ratio', 'State',
       'Home.Ownership', 'Monthly.Income', 'FICO.Range', 'Open.CREDIT.Lines',
       'Revolving.CREDIT.Balance', 'Inquiries.in.the.Last.6.Months',
       'Employment.Length'],
      dtype='object')

In [6]:
## columns rename
loan.rename( {'Amount.Requested': 'amt_req', 'Amount.Funded.By.Investors': 'amt_funded_by_investors', 'Interest.Rate': 'interest_rate',
       'Loan.Length': 'loan_length_Month', 'Loan.Purpose': 'loan_purpose', 'Debt.To.Income.Ratio': 'debt_to_inc_ratio',
       'Home.Ownership': 'home_ownership', 'Monthly.Income':'monthly_inc', 'FICO.Range':'FICO_Range', 'Open.CREDIT.Lines':'open_REDIT_lines',
       'Revolving.CREDIT.Balance': 'revolving_CREDIT_Bal' , 'Inquiries.in.the.Last.6.Months':'last_6month_inquiries',
       'Employment.Length':'employment_Length'}, axis= 1 , inplace= True)

In [7]:
loan.columns

Index(['amt_req', 'amt_funded_by_investors', 'interest_rate',
       'loan_length_Month', 'loan_purpose', 'debt_to_inc_ratio', 'State',
       'home_ownership', 'monthly_inc', 'FICO_Range', 'open_REDIT_lines',
       'revolving_CREDIT_Bal', 'last_6month_inquiries', 'employment_Length'],
      dtype='object')

In [8]:
## removing % symbol
loan.interest_rate = loan.interest_rate.str.replace("%", '').astype('float')
loan.debt_to_inc_ratio = loan.debt_to_inc_ratio.str.replace("%", '').astype('float')
# loan length col
loan.loan_length_Month = loan.loan_length_Month.str.replace("months", '').astype('float')

In [9]:
## fill null value
loan.isnull().sum()

amt_req                     1
amt_funded_by_investors     1
interest_rate               2
loan_length_Month           0
loan_purpose                0
debt_to_inc_ratio           1
State                       0
home_ownership              1
monthly_inc                 1
FICO_Range                  2
open_REDIT_lines            3
revolving_CREDIT_Bal        3
last_6month_inquiries       3
employment_Length          77
dtype: int64

In [10]:
loan.drop( columns='employment_Length' , inplace= True)
loan.dropna( inplace= True)

In [11]:
loan.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2489 entries, 0 to 2499
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   amt_req                  2489 non-null   float64
 1   amt_funded_by_investors  2489 non-null   float64
 2   interest_rate            2489 non-null   float64
 3   loan_length_Month        2489 non-null   float64
 4   loan_purpose             2489 non-null   object 
 5   debt_to_inc_ratio        2489 non-null   float64
 6   State                    2489 non-null   object 
 7   home_ownership           2489 non-null   object 
 8   monthly_inc              2489 non-null   float64
 9   FICO_Range               2489 non-null   object 
 10  open_REDIT_lines         2489 non-null   float64
 11  revolving_CREDIT_Bal     2489 non-null   float64
 12  last_6month_inquiries    2489 non-null   float64
dtypes: float64(9), object(4)
memory usage: 272.2+ KB


In [12]:
# seperate categorical and continuous variables
loan_conti_vars = loan.loc[:, (loan.dtypes == 'float64') | (loan.dtypes == 'int64')]
loan_cat_vars = loan.loc[:, (loan.dtypes == 'object')]

In [13]:
loan_cat_vars.describe

<bound method NDFrame.describe of             loan_purpose State home_ownership FICO_Range
0     debt_consolidation    SC       MORTGAGE    735-739
1     debt_consolidation    TX       MORTGAGE    715-719
2     debt_consolidation    CA       MORTGAGE    690-694
3     debt_consolidation    KS       MORTGAGE    695-699
4            credit_card    NJ           RENT    695-699
...                  ...   ...            ...        ...
2495  debt_consolidation    NY       MORTGAGE    705-709
2496    home_improvement    MD            OWN    740-744
2497  debt_consolidation    PA       MORTGAGE    680-684
2498      major_purchase    NJ           RENT    675-679
2499  debt_consolidation    NY           RENT    670-674

[2489 rows x 4 columns]>

In [14]:
loan_conti_vars.apply(fn_descriptives)

Unnamed: 0,amt_req,amt_funded_by_investors,interest_rate,loan_length_Month,debt_to_inc_ratio,monthly_inc,open_REDIT_lines,revolving_CREDIT_Bal,last_6month_inquiries
dtype,float64,float64,float64,float64,float64,float64,float64,float64,float64
cardinality,378,707,273,2,1664,630,29,2342,10
n_tot,2489,2489,2489,2489,2489,2489,2489,2489,2489
n,2489,2489,2489,2489,2489,2489,2489,2489,2489
n_miss,0,0,0,0,0,0,0,0,0
n_miss_perc,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sum,30888300.0,29879833.09,32533.5,102756.0,38286.26,14150580.53,25075.0,37883581.0,2254.0
var,60945301.903335,60039786.026045,17.450548,98.935762,56.357326,15727546.230929,20.3333,334804674.667375,1.517208
std,7806.747204,7748.534444,4.177385,9.946646,7.507152,3965.797049,4.509246,18297.668558,1.23175
mean,12409.923664,12004.754154,13.070912,41.28405,15.382186,5685.2473,10.074327,15220.40217,0.905585


In [15]:
# outlier treatment
for var in loan_conti_vars.columns:
        loan[var] = np.where(loan[var] > loan[var].quantile(0.99) ,
                                    loan[var].quantile(0.99), loan[var])

# 1(a)  Intrest rate is varied for different loan amounts

#### 1 Define H0:
    variables independant

#### 2 Dwefine Ha: 
    variables dependant

#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        

#### 4 perform the test:
     t test: tttest_relative
     stats.pearsonr

In [16]:
stats.pearsonr(loan.interest_rate,loan.amt_funded_by_investors)

PearsonRResult(statistic=0.3341023116568378, pvalue=5.737570334459154e-66)

In [17]:
stats.ttest_rel(loan.interest_rate,loan.amt_funded_by_investors)

Ttest_relResult(statistic=-77.22379032059564, pvalue=0.0)

##### 5 business conclusion:
     for the larger amount of loan interset rate is lower

# (b) Loan length is directly effecting intrest rate

#### 1 Define H0:
    variables independant

#### 2 Dwefine Ha: 
    variables dependant

#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        

#### 4 perform the test:
    stats.spearmanr

In [18]:
stats.spearmanr(loan.interest_rate , loan.loan_length_Month)

SpearmanrResult(correlation=0.3897344194175745, pvalue=4.3727413630439584e-91)

##### business conclusion:
 
    there is a direct corelation of the pre usage with latest month usage

# (c)  Inrest rate varies for different purpose of loans

In [19]:
loan.loan_purpose.unique()

array(['debt_consolidation', 'credit_card', 'other', 'moving', 'car',
       'vacation', 'home_improvement', 'house', 'major_purchase',
       'educational', 'medical', 'wedding', 'small_business',
       'renewable_energy'], dtype=object)

In [20]:
# data prep

# samples for each loan purposes

a1 = loan.loc[ loan.loan_purpose == 'debt_consolidation', 'interest_rate' ]
a2 = loan.loc[ loan.loan_purpose == 'credit_card', 'interest_rate' ]
a3 = loan.loc[ loan.loan_purpose == 'other', 'interest_rate' ]
a4 = loan.loc[ loan.loan_purpose == 'moving', 'interest_rate' ]
a5 = loan.loc[ loan.loan_purpose == 'car', 'interest_rate' ]
a6 = loan.loc[ loan.loan_purpose == 'vacation', 'interest_rate' ]
a7 = loan.loc[ loan.loan_purpose == 'home_improvement', 'interest_rate' ]
a8 = loan.loc[ loan.loan_purpose == 'house', 'interest_rate' ]
a9 = loan.loc[ loan.loan_purpose == 'major_purchase', 'interest_rate' ]
a10 = loan.loc[ loan.loan_purpose == 'educational', 'interest_rate' ]
a11 = loan.loc[ loan.loan_purpose == 'medical', 'interest_rate' ]
a12 = loan.loc[ loan.loan_purpose == 'wedding', 'interest_rate' ]
a13 = loan.loc[ loan.loan_purpose == 'small_business', 'interest_rate' ]
a14 = loan.loc[ loan.loan_purpose == 'renewable_energy', 'interest_rate' ]

##### 1 Define H0:
        samples are from same population

#### 2 Define Ha:
       samples are from different populatiom
#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        
#### 4 perform the test:
      stats.f_oneway

In [21]:
stats.f_oneway(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10,a11,a12,a13,a14)

F_onewayResult(statistic=7.485329679963486, pvalue=1.1463480725465721e-14)

#### business conclusion
     Interest rate for different loan purpose is different

# (d) There is relationship between FICO scores and Home Ownership. It means that, People with owning home will have high FICO scores.


#### data prep fot chi square test

In [22]:
obs = pd.crosstab( loan.home_ownership, loan.FICO_Range )
obs

FICO_Range,640-644,645-649,650-654,655-659,660-664,665-669,670-674,675-679,680-684,685-689,...,780-784,785-789,790-794,795-799,800-804,805-809,810-814,815-819,820-824,830-834
home_ownership,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MORTGAGE,0,1,1,0,41,52,67,79,55,61,...,21,12,15,7,7,6,5,6,0,1
OTHER,0,0,0,1,0,0,1,1,2,0,...,0,0,0,0,0,0,0,0,0,0
OWN,1,0,0,0,17,18,13,11,9,10,...,3,2,1,1,3,2,1,0,1,0
RENT,4,2,0,3,66,75,89,75,91,64,...,4,5,4,5,2,3,2,0,0,0



   
##### 1 Define H0:
      No relationship, independant

#### 2 Define Ha:
       variables are related, dependant
#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        
#### 4 perform the test:

In [23]:
print ('p value =', stats.chi2_contingency( obs )[1])

p value = 8.424183189199129e-17


#### business conclusion
    people who owns a house has high FICO score as compare to people who doesn't.

# BUSINESS PROBLEM 2

### import dataset

In [24]:
price_Quotes = pd.read_csv("C:/Users/Jayanto Debnath/Downloads/Basic Statistics - Hypothesis Testing/Price_Quotes.csv") 
price_Quotes

Unnamed: 0,Order_Number,Barry_Price,Mary_Price
0,1,126,114
1,2,110,118
2,3,138,114
3,4,142,111
4,5,146,129
5,6,136,119
6,7,94,97
7,8,103,104
8,9,140,127
9,10,152,133


In [25]:
price_Quotes.isnull().sum()

Order_Number    0
Barry_Price     0
Mary_Price      0
dtype: int64

## Q.We would like to assess if there is any difference in the average price quotes provided by Mary and Barry

In [26]:
# print the avg/mean of the samples
print( 'mean of Barry price:', round(price_Quotes.Barry_Price.mean(), 1) )
print( 'mean of Mary price:', round(price_Quotes.Mary_Price.mean(), 1) )

mean of Barry price: 124.3
mean of Mary price: 114.8


##### 1 Define H0:
        u2 = u1

#### 2 Define Ha:
       u2 <> u1
#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        
#### 4 perform the test:
    stats.ttest

In [27]:
 stats.ttest_rel( price_Quotes.Barry_Price, price_Quotes.Mary_Price )

Ttest_relResult(statistic=2.5213765108923494, pvalue=0.02840588045242053)

#### business conclusion
    Their is average price difference in Barry and Mary quotes

# BUSINESS PROBLEM 3

### import dataset

In [28]:
treatment = pd.read_csv("C:/Users/Jayanto Debnath/Downloads/Basic Statistics - Hypothesis Testing/Treatment_Facility.csv")
treatment.head()

Unnamed: 0,Month,Reengineer,Employee_Turnover,VAR4,VAR5
0,1,Prior,0.0,24.390244,42.682927
1,2,Prior,6.0606,19.354839,25.806452
2,3,Prior,12.1212,35.087719,146.19883
3,4,Prior,3.3333,18.404908,110.429448
4,5,Prior,12.9032,17.964072,23.952096


In [29]:
 treatment.rename( columns={'VAR4':'TRFF(%)','VAR5':'CI(%)'}, inplace=True)

In [30]:
treatment.isnull().sum()

Month                0
Reengineer           0
Employee_Turnover    0
TRFF(%)              0
CI(%)                0
dtype: int64

## BUSINESS PROBLEM : Determine what effect, if any, the reengineering effort had on the incidence behavioral problems and staff turnover. i.e To determine if the reengineering effort changed the critical incidence rate. Is there evidence that the critical incidence rate improved?

In [31]:
#samples for each Reengineer 
s1 = treatment.loc[ treatment.Reengineer == 'Prior', 'CI(%)' ]
s2 = treatment.loc[ treatment.Reengineer == 'Post', 'CI(%)' ]

# print the avg/mean of three samples
print( 'mean of s1:', round(s1.mean(), 1), 
          '| mean of s2:', round(s2.mean(), 1))

mean of s1: 53.9 | mean of s2: 23.3


##### 1 Define H0:
         u2 <= u1

#### 2 Define Ha:
       u2 > u1
#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        
#### 4 perform the test:

In [32]:
print( stats.ttest_ind( s1, s2 ) )

Ttest_indResult(statistic=1.627914425352865, pvalue=0.12091989189884148)


#### business conclusion
      WIth the give data we can not say that their is any effect on the reengineering efforts

# BUSINESS PROBLEM 4

In [33]:
priority_assessment = pd.read_csv("C:/Users/Jayanto Debnath/Downloads/Basic Statistics - Hypothesis Testing/Priority_Assessment.csv")
priority_assessment

Unnamed: 0,Days,Priority
0,3.3,High
1,7.9,Medium
2,0.3,High
3,0.7,Medium
4,8.6,Medium
...,...,...
637,2.5,Low
638,0.3,High
639,0.3,Medium
640,1.3,Medium


In [35]:
priority_assessment.isnull().sum()

Days        0
Priority    0
dtype: int64

## BUSINESS PROBLEM: We will focus on the prioritization system. If the system is working, thenhigh priority jobs, on average, should be completed more quickly than medium priority jobs,and medium priority jobs should be completed more quickly than low priority jobs. Use thedata provided to determine whether thisis, in fact, occurring

#### 1 Define H0:
    variables independant

#### 2 Dwefine Ha: 
    variables dependant

#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        

#### 4 perform the test:

In [36]:
stats.spearmanr( priority_assessment.Days , priority_assessment.Priority)



SpearmanrResult(correlation=0.017913791551236326, pvalue=0.6505153120156153)

#### business conclusion:
    With the given data we cannot say that the prioritization system is working.

# BUSINESS PROBLEM 5

In [39]:
films = pd.read_csv("C:/Users/Jayanto Debnath/Downloads/Basic Statistics - Hypothesis Testing/Films.csv")
films

Unnamed: 0,_rowstate_,Movie,Gender,Marital_Status,Sinage,Parking,Clean,Overall,Age,Income,Hear_About
0,0,Ferris Buellers Day Off,Female,Married,2.0,2.0,2.0,2.0,3.0,1.0,5
1,0,Ferris Buellers Day Off,Female,Single,1.0,1.0,1.0,1.0,2.0,1.0,5
2,0,Ferris Buellers Day Off,Male,Married,2.0,4.0,3.0,2.0,4.0,1.0,5
3,0,Ferris Buellers Day Off,Female,Married,1.0,3.0,2.0,2.0,4.0,1.0,5
4,0,Ferris Buellers Day Off,Female,Married,1.0,1.0,1.0,1.0,3.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...
325,0,Old School,2,2,1.0,2.0,1.0,1.0,2.0,1.0,1
326,0,Old School,1,1,2.0,2.0,2.0,2.0,3.0,3.0,5
327,0,Old School,2,1,2.0,1.0,1.0,2.0,2.0,2.0,5
328,0,Old School,1,1,1.0,1.0,1.0,1.0,2.0,1.0,3


In [40]:
films.isnull().sum()

_rowstate_         0
Movie              0
Gender             0
Marital_Status     2
Sinage             2
Parking            2
Clean              3
Overall            2
Age                2
Income            16
Hear_About         7
dtype: int64

In [41]:
for var in films.columns:
    films[var].fillna(value = films[var].mode()[0], inplace=True)

In [42]:
def gender (x):
    if x.Gender == 'Male' :
        return '1'
    elif x.Gender == 'Female'  :
        return '0'
    elif x.Gender == '1'  :
        return '1'
    else :
        return '0'

In [43]:
#converting males to 1 and females to 0 
films.Gender = films.apply(gender, axis=1)

In [44]:
def married_status (x):
    if x.Marital_Status == 'Married' :
        return '1'
    elif x.Marital_Status == 'Single'  :
        return '0'
    elif x.Marital_Status == '1'  :
        return '1'
    else :
        return '0'

In [45]:
#converting married to 1 and singles to 0
films.Marital_Status = films.apply(married_status, axis=1)

## Q.1 What is the overall level of customer satisfaction

In [46]:
overall = films.groupby('Overall')[['Overall']].count()
overall = overall.rename( columns={'Overall': 'count'}).reset_index()
overall =overall.rename( columns={'Overall': 'Satisfaction level'})
overall['Prob'] = overall['count'] / overall['count'].sum()
print(overall)
print('  As Satisfaction level 1 is Excellent and 2 is Good. That means their is 94% chance customers are satisfied. ')

   Satisfaction level  count      Prob
0                 1.0    151  0.457576
1                 2.0    162  0.490909
2                 3.0     12  0.036364
3                 4.0      1  0.003030
4                 5.0      4  0.012121
  As Satisfaction level 1 is Excellent and 2 is Good. That means their is 94% chance customers are satisfied. 


## Q.2 What factors are linked to satisfaction

In [47]:
# data prep for chi square test
obs = pd.crosstab( films.Sinage,films.Overall )



#### Ho
    No relationship, independant
#### Ha: 
    variables are related, dependant
   
#### CI:
      95%
#### p:
      0.05
  


#### perform the test

In [48]:

stats.chi2_contingency( obs )

(125.61859671027881,
 4.53298970003405e-19,
 16,
 array([[5.17060606e+01, 5.54727273e+01, 4.10909091e+00, 3.42424242e-01,
         1.36969697e+00],
        [7.22969697e+01, 7.75636364e+01, 5.74545455e+00, 4.78787879e-01,
         1.91515152e+00],
        [2.19636364e+01, 2.35636364e+01, 1.74545455e+00, 1.45454545e-01,
         5.81818182e-01],
        [2.74545455e+00, 2.94545455e+00, 2.18181818e-01, 1.81818182e-02,
         7.27272727e-02],
        [2.28787879e+00, 2.45454545e+00, 1.81818182e-01, 1.51515152e-02,
         6.06060606e-02]]))

#### business conclusion
    with increase in sinage rating overall staifaction is also increases

In [50]:
# data prep for chi square test
obs = pd.crosstab( films.Parking,films.Overall )

#### 1 Define H0:
    variables independant

#### 2 Dwefine Ha: 
    variables dependant

#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        

#### 4 perform the test:

In [51]:
stats.chi2_contingency( obs )

(230.61691663391676,
 4.783162283608494e-40,
 16,
 array([[6.17727273e+01, 6.62727273e+01, 4.90909091e+00, 4.09090909e-01,
         1.63636364e+00],
        [7.09242424e+01, 7.60909091e+01, 5.63636364e+00, 4.69696970e-01,
         1.87878788e+00],
        [1.32696970e+01, 1.42363636e+01, 1.05454545e+00, 8.78787879e-02,
         3.51515152e-01],
        [3.20303030e+00, 3.43636364e+00, 2.54545455e-01, 2.12121212e-02,
         8.48484848e-02],
        [1.83030303e+00, 1.96363636e+00, 1.45454545e-01, 1.21212121e-02,
         4.84848485e-02]]))

#### business conclusion
    with increase in parking rating overall staifaction is also increases

In [52]:
# data prep for chi square test
obs = pd.crosstab( films.Clean,films.Overall )

#### 1 Define H0:
    variables independant

#### 2 Dwefine Ha: 
    variables dependant

#### 3. Define ci & pvalue:¶
    ci = 95%
    p_val = 0.05
    signi. val =  1.960        

#### 4 perform the test

In [53]:
stats.chi2_contingency( obs )


(127.71945555999517,
 1.7772535915434025e-19,
 16,
 array([[6.58909091e+01, 7.06909091e+01, 5.23636364e+00, 4.36363636e-01,
         1.74545455e+00],
        [7.09242424e+01, 7.60909091e+01, 5.63636364e+00, 4.69696970e-01,
         1.87878788e+00],
        [1.09818182e+01, 1.17818182e+01, 8.72727273e-01, 7.27272727e-02,
         2.90909091e-01],
        [2.28787879e+00, 2.45454545e+00, 1.81818182e-01, 1.51515152e-02,
         6.06060606e-02],
        [9.15151515e-01, 9.81818182e-01, 7.27272727e-02, 6.06060606e-03,
         2.42424242e-02]]))

#### business conclusion
    with increase in clean rating overall staifaction is also increases

## Q.3 What is the demographic profile of Film on the Rocks patrons

In [54]:
#Demography on Martial Status
Marital_Status = films.groupby('Marital_Status')[['Marital_Status']].count()
Marital_Status = Marital_Status.rename( columns={'Marital_Status': 'count'}).reset_index()
Marital_Status['Prob'] = Marital_Status['count'] / Marital_Status['count'].sum()
print(Marital_Status)
print('  Their is 69% probability that the audiance is Married')

  Marital_Status  count     Prob
0              0    230  0.69697
1              1    100  0.30303
  Their is 69% probability that the audiance is Married


In [55]:
#Demography on Martial Status
Age = films.groupby('Age')[['Age']].count()
Age = Age.rename( columns={'Age': 'count'}).reset_index()
Age['Prob'] = Age['count'] / Age['count'].sum()
print(Age)
print('  Their is 53% probability that the age of the audiance is between 13-30')

   Age  count      Prob
0  1.0     26  0.078788
1  2.0    177  0.536364
2  3.0    117  0.354545
3  4.0     10  0.030303
  Their is 53% probability that the age of the audiance is between 13-30


In [57]:
#Demography on Income
Income = films.groupby('Income')[['Income']].count()
Income = Income.rename( columns={'Income': 'count'}).reset_index()
Income['Prob'] = Income['count'] / Income['count'].sum()
print(Income)
print('  Their is 47% probability that the Income of the audiance is less than 50K') 

   Income  count      Prob
0     1.0    158  0.478788
1     2.0     82  0.248485
2     3.0     90  0.272727
  Their is 47% probability that the Income of the audiance is less than 50K


## Q.4 In what media outlet(s) should the film series be advertised

In [58]:
#Demography on Income
Hear_About = films.groupby('Hear_About')[['Hear_About']].count()
Hear_About = Hear_About.rename( columns={'Hear_About': 'count'}).reset_index()
Hear_About['Prob'] = Hear_About['count'] / Hear_About['count'].sum()
print(Hear_About.sort_values(by='Prob', ascending = False))
print('  Their is 70% probability that the audiance heard about the film series solely through word of mouth.')

   Hear_About  count      Prob
9           5    233  0.706061
7           4     41  0.124242
0           1     22  0.066667
4           3     14  0.042424
2           2     12  0.036364
3         2,5      2  0.006061
5         3,4      2  0.006061
1         1,5      1  0.003030
6         3,5      1  0.003030
8         4,5      1  0.003030
10        5,4      1  0.003030
  Their is 70% probability that the audiance heard about the film series solely through word of mouth.
