# Statistics for Decision Making #

In [3]:
import pandas as pd
import numpy as np
# The plotting functions live in the `pyplot` submodule; aliasing the top-level
# `matplotlib` package as `plt` would make calls such as plt.figure() fail.
import matplotlib.pyplot as plt
import seaborn as sns

In [20]:
# Load the property sales CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where the file exists at exactly this location.
# NOTE(review): the name `property` shadows Python's built-in `property`.
csv_path = r"D:\Python\Project on Statistics and Decision making\property.csv"
property = pd.read_csv(csv_path)

In [5]:
# Preview the first five rows (five is also head()'s default)
property.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [6]:
# Print a summary of the DataFrame: index range, column names,
# non-null counts per column and column dtypes
property.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [7]:
# Count the missing entries in every column of the DataFrame
missing_values = property.isna().sum()

In [8]:
# Show the per-column missing-value counts under a heading line
print("Missing values per column:", missing_values, sep="\n")

Missing values per column:
Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64


In [14]:
# Keep only the columns stored as 64-bit floats or 64-bit integers
numeric_dtypes = ['float64', 'int64']
numeric_columns = property.select_dtypes(include=numeric_dtypes)

In [16]:
# Replace each missing value with the mean of its own column.
# NOTE(review): the result is a new object holding only the numeric columns;
# `property` itself is not modified by this step.
column_means = numeric_columns.mean()
property_filled = numeric_columns.fillna(value=column_means)

In [17]:
# Show the imputed numeric DataFrame under a heading line
print("DataFrame after filling missing values:", property_filled, sep="\n")

DataFrame after filling missing values:
       Rooms      Price  Distance  Postcode  Bedroom2  Bathroom  Car  \
0          2  1480000.0       2.5    3067.0       2.0       1.0  1.0   
1          2  1035000.0       2.5    3067.0       2.0       1.0  0.0   
2          3  1465000.0       2.5    3067.0       3.0       2.0  0.0   
3          3   850000.0       2.5    3067.0       3.0       2.0  1.0   
4          4  1600000.0       2.5    3067.0       3.0       1.0  2.0   
...      ...        ...       ...       ...       ...       ...  ...   
13575      4  1245000.0      16.7    3150.0       4.0       2.0  2.0   
13576      3  1031000.0       6.8    3016.0       3.0       2.0  2.0   
13577      3  1170000.0       6.8    3016.0       3.0       2.0  4.0   
13578      4  2500000.0       6.8    3016.0       4.0       1.0  5.0   
13579      4  1285000.0       6.3    3013.0       4.0       1.0  1.0   

       Landsize  BuildingArea    YearBuilt  Lattitude  Longtitude  \
0         202.0     151.96

##### Selecting only the numeric columns before filling ensures that mean imputation is applied only to columns for which a mean can be computed. Note that the filled DataFrame therefore contains only the numeric columns #####

### Question 1: Testing Assumption for Altona Suburb

In [19]:
import pandas as pd
from scipy.stats import ttest_1samp

In [23]:
# Keep only the rows whose suburb is Altona
is_altona = property['Suburb'].eq('Altona')
altona_data = property[is_altona]

In [24]:
# Hypothesised typical property price, used as the null-hypothesis mean
assumed_price = 800_000

In [25]:
# One-sample t-test of the Altona sale prices against the assumed price
altona_prices = altona_data['Price']
test_result = ttest_1samp(altona_prices, assumed_price)
t_statistic, p_value = test_result.statistic, test_result.pvalue

In [26]:
# Compare the p-value with the 0.05 significance level and report the outcome
significance_level = 0.05
is_significant = p_value < significance_level
if not is_significant:
    print("There is no significant difference in the typical property price in Altona.")
else:
    print("The typical property price in Altona is significantly different from $800,000.")

There is no significant difference in the typical property price in Altona.


##### Because the p-value is greater than or equal to 0.05, we fail to reject the null hypothesis: the data do not provide sufficient evidence that the typical property price in Altona differs from \$800,000 #####

### Question 2: Comparison of Property Prices in Summer and Winter Months

In [30]:
# Parse the day/month/year date strings into pandas datetime values
date_format = '%d/%m/%Y'
property['Date'] = pd.to_datetime(property['Date'], format=date_format)


In [32]:
# Derive the calendar month number of each sale date
sale_dates = property['Date']
property['Month'] = sale_dates.dt.month

In [33]:
# Label each sale 'Winter' (October to March) or 'Summer' (every other month).
# NOTE(review): confirm this month split matches the intended season
# definition for the region the data comes from.
winter_months = [10, 11, 12, 1, 2, 3]
in_winter = property['Month'].isin(winter_months)
property['Season'] = in_winter.map({True: 'Winter', False: 'Summer'})

In [34]:
# Restrict to sales whose date falls in the year 2016
sale_year = property['Date'].dt.year
data_2016 = property[sale_year == 2016]

In [35]:
# Split the 2016 sales into the two season groups
season_2016 = data_2016['Season']
summer_data = data_2016[season_2016 == 'Summer']
winter_data = data_2016[season_2016 == 'Winter']


In [39]:
from scipy.stats import ttest_ind


In [40]:
# Independent two-sample t-test on summer versus winter sale prices.
# NOTE(review): ttest_ind defaults to equal_var=True (pooled variance); consider
# whether Welch's test (equal_var=False) would be more appropriate here.
summer_prices = summer_data['Price']
winter_prices = winter_data['Price']
t_statistic, p_value = ttest_ind(summer_prices, winter_prices)

In [41]:
# Compare the p-value with the 0.05 significance level and report the outcome
significance_level = 0.05
is_significant = p_value < significance_level
if not is_significant:
    print("There is no significant difference in property prices between summer and winter months.")
else:
    print("There is a significant difference in property prices between summer and winter months.")

There is a significant difference in property prices between summer and winter months.


##### Because the p-value is less than 0.05, we reject the null hypothesis and conclude that, for properties sold in 2016, there is a significant difference in property prices between summer and winter months #####

### Question 3: Probability of Properties Without Car Parking in Abbotsford

In [42]:
# Keep only the rows whose suburb is Abbotsford
in_abbotsford = property['Suburb'].eq('Abbotsford')
abbotsford_data = property[in_abbotsford]

##### Calculate Probability of Properties Without Car Parking

In [43]:
# Number of rows (sales) recorded for Abbotsford
total_properties = abbotsford_data.shape[0]

In [44]:
# Count the Abbotsford rows whose 'Car' value is exactly 0
has_no_parking = abbotsford_data['Car'] == 0
properties_without_parking = int(has_no_parking.sum())

In [45]:
# Probability of properties without car parking: the count of Abbotsford
# properties with no car spaces divided by the total Abbotsford count
probability_no_parking = properties_without_parking / total_properties

In [46]:
# Keep three decimal places of the probability
probability_no_parking = round(probability_no_parking, ndigits=3)

In [47]:
# Report the probability
no_parking_label = "Probability of a property in Abbotsford not having a car parking:"
print(no_parking_label, probability_no_parking)

Probability of a property in Abbotsford not having a car parking: 0.268


##### The calculated probability represents the likelihood of randomly selecting a property in Abbotsford that does not have a car parking space.

### Question 4: Probability of Finding Properties with 3 Rooms in Abbotsford

##### Calculate Probability of Properties with 3 Rooms

In [50]:
# Number of rows (sales) recorded for Abbotsford
total_properties = abbotsford_data.shape[0]

In [51]:
# Count the Abbotsford rows whose 'Rooms' value is exactly 3
has_three_rooms = abbotsford_data['Rooms'] == 3
properties_with_3_rooms = int(has_three_rooms.sum())

In [52]:
# Probability of properties with 3 rooms: the count of Abbotsford
# properties with 3 rooms divided by the total Abbotsford count
probability_3_rooms = properties_with_3_rooms / total_properties

In [64]:
# Keep three decimal places of the probability
probability_3_rooms = round(probability_3_rooms, ndigits=3)

In [54]:
# Report the probability
three_rooms_label = "Probability of finding a property with 3 rooms in Abbotsford:"
print(three_rooms_label, probability_3_rooms)

Probability of finding a property with 3 rooms in Abbotsford: 0.357


##### The calculated probability represents the likelihood of randomly selecting a property in Abbotsford that has exactly 3 rooms.

### Question 5: Probability of Finding Properties with 2 Bathrooms in Abbotsford

##### Calculate Probability of Properties with 2 Bathrooms

In [55]:
# Number of rows (sales) recorded for Abbotsford
total_properties = abbotsford_data.shape[0]

In [58]:
# Count the Abbotsford rows whose 'Bathroom' value is exactly 2
has_two_bathrooms = abbotsford_data['Bathroom'] == 2
properties_with_2_bathrooms = int(has_two_bathrooms.sum())

In [60]:
# Probability of properties with 2 bathrooms: the count of Abbotsford
# properties with 2 bathrooms divided by the total Abbotsford count
probability_2_bathrooms = properties_with_2_bathrooms / total_properties

In [62]:
# Keep three decimal places of the probability
probability_2_bathrooms = round(probability_2_bathrooms, ndigits=3)

In [63]:
# Report the probability
two_bathrooms_label = "Probability of finding a property with 2 bathrooms in Abbotsford:"
print(two_bathrooms_label, probability_2_bathrooms)

Probability of finding a property with 2 bathrooms in Abbotsford: 0.339


##### The calculated probability represents the likelihood of randomly selecting a property in Abbotsford that has exactly 2 bathrooms.