In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline


In [2]:
file_path="C://project//4. Basic Statistics - Hypothesis Testing//"
file_name="LoansData.csv"
loandata=pd.read_csv(file_path+file_name)
loandata=loandata.dropna()
loandata["Interest.Rate"]=pd.to_numeric(loandata["Interest.Rate"].str.rstrip("%"),errors="coerce")
loandata["Loan.Length"]=pd.to_numeric(loandata["Loan.Length"].str.rstrip("months"))

In [3]:
stats.pearsonr(loandata["Interest.Rate"],loandata["Amount.Funded.By.Investors"])

PearsonRResult(statistic=0.33159553110364703, pvalue=5.064256950487693e-63)

In [4]:
stats.pearsonr(loandata["Loan.Length"],loandata["Interest.Rate"])

PearsonRResult(statistic=0.4250573823094828, pvalue=1.7938010673273293e-106)

In [5]:
group_data=loandata.groupby(loandata["Loan.Purpose"])['Interest.Rate'].apply(list)
stats.f_oneway(*group_data)
#another method
loan_purpose=loandata["Loan.Purpose"].unique()
group=[loandata[loandata["Loan.Purpose"]==purpose]["Interest.Rate"] for purpose in loan_purpose]
statistic, p_value=stats.f_oneway(*group)
print("p_value:",p_value)
if p_value < 0.05:
    print("Reject the null hypothesis. There is a significant difference in interest rates among different loan purposes.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference in interest rates among different loan purposes.")

p_value: 2.7646672581411367e-14
Reject the null hypothesis. There is a significant difference in interest rates among different loan purposes.


In [6]:
fico_split=loandata["FICO.Range"].str.split("-",expand=True)
fico_split=fico_split.astype(str)
loandata["FICO combined"]=fico_split[0]+fico_split[1]
loandata["FICO combined"]=loandata["FICO combined"].astype("int64")
group_data=loandata.groupby(loandata["Home.Ownership"])["FICO combined"].apply(list)
f_statistic, p_value = stats.f_oneway(*group_data)
print(p_value)
if p_value < 0.05:
    print("Reject the null hypothesis. There is a significant difference in FICO combined scores among different home ownership categories.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference in FICO combined scores among different home ownership categories.")

3.2682005447315283e-13
Reject the null hypothesis. There is a significant difference in FICO combined scores among different home ownership categories.


In [7]:
file_name="Price_Quotes.csv"
price_quotes=pd.read_csv(file_path+file_name)
price_quotes

Unnamed: 0,Order_Number,Barry_Price,Mary_Price
0,1,126,114
1,2,110,118
2,3,138,114
3,4,142,111
4,5,146,129
5,6,136,119
6,7,94,97
7,8,103,104
8,9,140,127
9,10,152,133


In [8]:
t_statistic,p_value=stats.ttest_ind(price_quotes["Barry_Price"],price_quotes["Mary_Price"])
print("p_value:",p_value)
if p_value<0.05:
    print("Reject the null hypothesis. There is a significant difference in the average price quotes between Barry and Mary.")
else:
    print("Fail to reject the null hypothesis. There is no significant difference in the average price quotes between Barry and Mary.")

p_value: 0.17114226132118285
Fail to reject the null hypothesis. There is no significant difference in the average price quotes between Barry and Mary.


In [9]:
file_name="Treatment_Facility.csv"
treatment_facility=pd.read_csv(file_path+file_name)
treatment_facility

Unnamed: 0,Month,Reengineer,Employee_Turnover,VAR4,VAR5
0,1,Prior,0.0,24.390244,42.682927
1,2,Prior,6.0606,19.354839,25.806452
2,3,Prior,12.1212,35.087719,146.19883
3,4,Prior,3.3333,18.404908,110.429448
4,5,Prior,12.9032,17.964072,23.952096
5,6,Prior,9.6774,41.176471,47.058824
6,7,Prior,11.7647,13.422819,0.0
7,8,Prior,11.4286,31.25,25.0
8,9,Prior,23.0769,17.241379,132.183908
9,10,Prior,15.0,16.574586,16.574586


In [10]:
df=pd.crosstab(treatment_facility["Reengineer"],treatment_facility["VAR5"])
chi2_stat, p_val, dof, expected=stats.chi2_contingency(df)
pre=treatment_facility[treatment_facility["Reengineer"]=="Prior"]["Employee_Turnover"]
post=treatment_facility[treatment_facility["Reengineer"]=="Post"]["Employee_Turnover"]
t_stat,p_turnover=stats.ttest_ind(pre,post)
print(f'Chi-square test for Critical Incidents: p-value = {p_val}')
print(f'T-test for Staff Turnover: p-value = {p_turnover}')
if p_val < 0.05:
    print('There is evidence that the critical incident rate changed after reengineering.')
else:
    print('There is no significant evidence that the critical incident rate changed after reengineering.')

if p_turnover < 0.05:
    print('There is evidence that staff turnover changed after reengineering.')
else:
    print('There is no significant evidence that staff turnover changed after reengineering.')

Chi-square test for Critical Incidents: p-value = 0.394578182086001
T-test for Staff Turnover: p-value = 0.09361109345535304
There is no significant evidence that the critical incident rate changed after reengineering.
There is no significant evidence that staff turnover changed after reengineering.


In [11]:
file_name="Priority_Assessment.csv"
priority_assessment=pd.read_csv(file_path+file_name)
priority_assessment

Unnamed: 0,Days,Priority
0,3.3,High
1,7.9,Medium
2,0.3,High
3,0.7,Medium
4,8.6,Medium
...,...,...
637,2.5,Low
638,0.3,High
639,0.3,Medium
640,1.3,Medium


In [12]:
high=priority_assessment[priority_assessment["Priority"]=="High"]["Days"]
medium=priority_assessment[priority_assessment["Priority"]=="Medium"]["Days"]
low=priority_assessment[priority_assessment["Priority"]=="Low"]["Days"]
statistic,p_value=stats.f_oneway(high,medium,low)
print("p_value",p_value)
if p_value<0.05:
    print("The prioritization system is effective. There are significant differences in completion times among different priority levels.")
else:
    print("The prioritization system does not show significant differences in completion times among different priority levels.")


p_value 0.16411459461716182
The prioritization system does not show significant differences in completion times among different priority levels.


In [13]:
file_name="Films.csv"
films=pd.read_csv(file_path+file_name)


In [14]:
films['Satisfaction_Level'] = films['Overall'].apply(lambda x: 'Excellent' if x == 1 else 'Good' if x == 2 else 'Average' if x == 3 else 'Poor' if x == 4 else 'Very Poor')
films=films.dropna()
films=films.reset_index(drop=True)
films=films.drop(columns=["_rowstate_"])

In [15]:
gender_mapping = {'1':"Male", '2':"Female", 'Male':"Male", 'Female':"Female"}
films['Gender'] = films['Gender'].map(gender_mapping)
gender_mapping = {'Male':1, 'Female':2}
films["gender"]=films['Gender'].map(gender_mapping).astype("float64")
marital_mapping = {'1':"Married", '2':"Single", 'Married':"Married", 'Single':"Single"}
films['Marital_Status']=films['Marital_Status'].map(marital_mapping)
marital_mapping = {'Married':1, 'Single':2}
films["marital_status"]=films["Marital_Status"].map(marital_mapping).astype("float64")
films.drop(columns=["Gender","Marital_Status"])

Unnamed: 0,Movie,Sinage,Parking,Clean,Overall,Age,Income,Hear_About,Satisfaction_Level,gender,marital_status
0,Ferris Buellers Day Off,2.0,2.0,2.0,2.0,3.0,1.0,5,Good,2.0,1.0
1,Ferris Buellers Day Off,1.0,1.0,1.0,1.0,2.0,1.0,5,Excellent,2.0,2.0
2,Ferris Buellers Day Off,2.0,4.0,3.0,2.0,4.0,1.0,5,Good,1.0,1.0
3,Ferris Buellers Day Off,1.0,3.0,2.0,2.0,4.0,1.0,5,Good,2.0,1.0
4,Ferris Buellers Day Off,1.0,1.0,1.0,1.0,3.0,3.0,1,Excellent,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
296,Old School,1.0,2.0,1.0,1.0,2.0,1.0,1,Excellent,2.0,2.0
297,Old School,2.0,2.0,2.0,2.0,3.0,3.0,5,Good,1.0,1.0
298,Old School,2.0,1.0,1.0,2.0,2.0,2.0,5,Good,2.0,1.0
299,Old School,1.0,1.0,1.0,1.0,2.0,1.0,3,Excellent,1.0,1.0


In [23]:
satisfaction_columns = ['Sinage', 'Parking', 'Clean', 'Overall']
average_satisfaction = films[satisfaction_columns].mean()
average_satisfaction

Sinage     1.890365
Parking    1.777409
Clean      1.681063
Overall    1.621262
dtype: float64

In [67]:
null_value = 2
for column in satisfaction_columns:
    t_stat, p_value = stats.ttest_1samp(films[column], null_value)
    
    if p_value < 0.05:
        print(f'{column}: Evidence to reject the null hypothesis (p-value = {p_value}). Average satisfaction score is significantly different from {null_value}.')
    else:
        print(f'{column}: Insufficient evidence to reject the null hypothesis (p-value = {p_value}). Average satisfaction score is not significantly different from {null_value}.')

Sinage: Evidence to reject the null hypothesis (p-value = 0.025506494786433038). Average satisfaction score is significantly different from 2.
Parking: Evidence to reject the null hypothesis (p-value = 3.481623251781527e-06). Average satisfaction score is significantly different from 2.
Clean: Evidence to reject the null hypothesis (p-value = 7.361439626201562e-13). Average satisfaction score is significantly different from 2.
Overall: Evidence to reject the null hypothesis (p-value = 1.2647847846413057e-19). Average satisfaction score is significantly different from 2.


In [24]:
correlation_matrix = films[satisfaction_columns + [ 'gender','marital_status','Age', 'Income']].corr()
correlation_matrix

Unnamed: 0,Sinage,Parking,Clean,Overall,gender,marital_status,Age,Income
Sinage,1.0,0.470412,0.365028,0.410689,-0.103779,0.10801,-0.091722,-0.029748
Parking,0.470412,1.0,0.457018,0.547717,-0.081853,-0.065691,0.00164,-0.033455
Clean,0.365028,0.457018,1.0,0.385523,-0.073708,-0.006002,-0.000745,0.007355
Overall,0.410689,0.547717,0.385523,1.0,-0.042774,0.03885,-0.023926,-0.014984
gender,-0.103779,-0.081853,-0.073708,-0.042774,1.0,0.056814,-0.051682,-0.043455
marital_status,0.10801,-0.065691,-0.006002,0.03885,0.056814,1.0,-0.569448,-0.382013
Age,-0.091722,0.00164,-0.000745,-0.023926,-0.051682,-0.569448,1.0,0.155053
Income,-0.029748,-0.033455,0.007355,-0.014984,-0.043455,-0.382013,0.155053,1.0


In [77]:
demographic_variables = ['gender', 'marital_status', 'Age', 'Income']
for demographic_variable in demographic_variables:
    contingency_table = pd.crosstab(films[demographic_variable], films['Overall'])
    chi2_stat, p_value,dof,expected = stats.chi2_contingency(contingency_table)
    
    if p_value < 0.05:
        print(f'{demographic_variable}: Evidence to reject the null hypothesis (p-value = {p_value}). There is a significant association with overall satisfaction.')
    else:
        print(f'{demographic_variable}: Insufficient evidence to reject the null hypothesis (p-value = {p_value}). There is no significant association with overall satisfaction.')

gender: Insufficient evidence to reject the null hypothesis (p-value = 0.4746946769990794). There is no significant association with overall satisfaction.
marital_status: Insufficient evidence to reject the null hypothesis (p-value = 0.3154104323642798). There is no significant association with overall satisfaction.
Age: Insufficient evidence to reject the null hypothesis (p-value = 0.5267913370930692). There is no significant association with overall satisfaction.
Income: Insufficient evidence to reject the null hypothesis (p-value = 0.7631326888324932). There is no significant association with overall satisfaction.


In [33]:
demographic_distribution = films[['gender', 'marital_status', 'Age', 'Income']].value_counts()
print(demographic_distribution)


gender  marital_status  Age  Income
2.0     2.0             2.0  1.0       61
1.0     2.0             2.0  1.0       30
2.0     1.0             3.0  3.0       22
                             2.0       19
        2.0             2.0  2.0       17
1.0     1.0             3.0  3.0       17
2.0     2.0             3.0  1.0       17
                        2.0  3.0       17
1.0     2.0             2.0  2.0       14
2.0     2.0             1.0  3.0       10
1.0     2.0             2.0  3.0        9
        1.0             3.0  2.0        7
        2.0             3.0  2.0        7
2.0     2.0             3.0  2.0        6
1.0     2.0             3.0  1.0        4
2.0     2.0             1.0  1.0        4
        1.0             3.0  1.0        4
1.0     1.0             2.0  1.0        4
        2.0             1.0  1.0        3
2.0     1.0             2.0  1.0        3
        2.0             1.0  2.0        3
                        3.0  3.0        2
        1.0             2.0  2.0        

In [35]:
media_outlet_distribution = films['Hear_About'].value_counts()
media_outlet_distribution

Hear_About
5      212
4       39
1       19
3       13
2       11
2,5      2
4,5      1
1,5      1
5,4      1
3,5      1
3,4      1
Name: count, dtype: int64

In [86]:
significance_level=0.05
contingency_table=pd.crosstab(films["Hear_About"],films["Overall"])
chi2_stat, p_value,dof,expected = stats.chi2_contingency(contingency_table)
if p_value < significance_level:
    print(f'Reject null hypothesis: "Hear_About" is significantly associated with "Overall" satisfaction (p-value = {p_value}) at {significance_level} significance level.')
else:
    print(f'Fail to reject null hypothesis: No significant association between "Hear_About" and "Overall" satisfaction (p-value = {p_value}) at {significance_level} significance level.')

Fail to reject null hypothesis: No significant association between "Hear_About" and "Overall" satisfaction (p-value = 0.9920532791201511) at 0.05 significance level.
