# a) Creating and loading different types of datasets in Python using the required libraries
**i. Creation using pandas**

In [2]:
import pandas as pd
# Column-oriented input: every key becomes a column label and every list
# supplies that column's values, one entry per row.
data = {
    'col1': [1, 2],
    'col2': [3, 4],
}
df = pd.DataFrame(data=data)
# A bare trailing expression makes the notebook render the frame as a table.
df

Unnamed: 0,col1,col2
0,1,3
1,2,4


**ii. Loading CSV dataset files using Pandas**

In [5]:
import pandas as pd
# Read the California housing test split from the notebook's sample_data
# folder into a DataFrame.
df_csv = pd.read_csv(
    filepath_or_buffer='/content/sample_data/california_housing_test.csv'
)
# Show a banner followed by the first five rows as plain text.
print("DataFrame loaded from CSV:", df_csv.head(), sep="\n")

DataFrame loaded from CSV:
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
3    -118.36     33.82                28.0         67.0            15.0   
4    -119.67     36.33                19.0       1241.0           244.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
3        49.0        11.0         6.1359            330000.0  
4       850.0       237.0         2.9375             81700.0  


**iii. Loading datasets using sklearn**

In [7]:
from sklearn.datasets import load_iris

# load_iris() returns a container exposing the feature matrix as `.data`
# and the class labels as `.target`.
iris = load_iris()
X = iris.data
y = iris.target

# Report the dimensions first, then a five-row preview of each array.
print("\nDataset loaded using sklearn (Iris dataset):")
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)
print("Features sample:\n", X[0:5])
print("Target sample:\n", y[0:5])


Dataset loaded using sklearn (Iris dataset):
Features (X) shape: (150, 4)
Target (y) shape: (150,)
Features sample:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
Target sample:
 [0 0 0 0 0]


**b) Write a Python program to compute Mean, Median, Mode, Variance, and Standard Deviation using datasets**

In [9]:
import pandas as pd
import numpy as np
from scipy import stats
def compute_stats(series):
    """Print mean, median, mode, variance and standard deviation of a Series.

    Parameters
    ----------
    series : pandas.Series
        Numeric column to summarise. Its ``name`` is used in the header line.

    Returns
    -------
    None
        All results are written to standard output.

    Notes
    -----
    The pandas reductions used here (``mean``, ``median``, ``var``, ``std``)
    skip missing values by default, and ``var``/``std`` are the sample
    (n - 1) versions. The mode is computed on the non-missing values as well,
    so a column containing NaN reports a mode consistent with the other
    statistics instead of ``nan``. When several values tie for most frequent,
    ``scipy.stats.mode`` reports only one of them.
    """
    print(f"\nStatistics for column: {series.name}")
    print(f"  Mean: {series.mean()}")
    print(f"  Median: {series.median()}")

    # Drop NaN first so the mode follows the same skip-missing convention as
    # the pandas reductions above.
    observed = series.dropna().to_numpy()
    if observed.size:
        # keepdims=False yields a scalar mode for 1-D input.
        mode_value = stats.mode(observed, keepdims=False).mode
    else:
        # Nothing to count: report NaN, matching what mean()/median() give
        # for an empty or all-missing column.
        mode_value = np.nan
    print(f"  Mode: {mode_value}")

    print(f"  Variance: {series.var()}")
    print(f"  Standard Deviation: {series.std()}")

# --- Dictionary-built frame: summarise every column in turn ---
print("\nComputing statistics for the pandas DataFrame created from dictionary:")
for col in df:  # iterating a DataFrame yields its column labels
    compute_stats(df[col])

# --- CSV frame: summarise median_income only, if the column is present ---
print("\nComputing statistics for a column from the loaded CSV DataFrame (median_income):")
if 'median_income' not in df_csv.columns:
    print("Column 'median_income' not found in the CSV DataFrame.")
else:
    compute_stats(df_csv['median_income'])

# --- Iris: wrap the first feature column in a named Series so that
# compute_stats can print a column name in its header ---
print("\nComputing statistics for the first feature column of the Iris dataset:")
iris_feature_series = pd.Series(data=X[:, 0], name="sepal_length")
compute_stats(iris_feature_series)


Computing statistics for the pandas DataFrame created from dictionary:

Statistics for column: col1
  Mean: 1.5
  Median: 1.5
  Mode: 1
  Variance: 0.5
  Standard Deviation: 0.7071067811865476

Statistics for column: col2
  Mean: 3.5
  Median: 3.5
  Mode: 3
  Variance: 0.5
  Standard Deviation: 0.7071067811865476

Computing statistics for a column from the loaded CSV DataFrame (median_income):

Statistics for column: median_income
  Mean: 3.8072717999999997
  Median: 3.4871499999999997
  Mode: 15.0001
  Variance: 3.4392137555632885
  Standard Deviation: 1.854511729691481

Computing statistics for the first feature column of the Iris dataset:

Statistics for column: sepal_length
  Mean: 5.843333333333334
  Median: 5.8
  Mode: 5.0
  Variance: 0.6856935123042505
  Standard Deviation: 0.8280661279778629


**c) Demonstrate various data pre-processing techniques for a given dataset. Write a Python program to perform:**\
i. Reshaping the data\
ii. Filtering the data\
iii. Merging the data\
iv. Handling missing values in datasets\
v. Feature normalization: min-max normalization

In [10]:
import pandas as pd
import numpy as np

# i. Reshaping the data
# Wide -> long: col1 stays as the identifier, while col2 is unpivoted into a
# (variable, value) pair of columns.
df_reshaped = pd.melt(
    df,
    id_vars=['col1'],
    value_vars=['col2'],
    var_name='variable',
    value_name='value',
)
print("\nReshaped DataFrame:", df_reshaped, sep="\n")

# ii. Filtering the data
# Keep only the rows whose total_rooms value is strictly greater than 100;
# the original index labels are preserved, so gaps may appear in the index.
df_filtered = df_csv.loc[df_csv['total_rooms'] > 100]
print("\nFiltered DataFrame (total_rooms > 100):", df_filtered.head(), sep="\n")

# iii. Merging the data
# Build a second small frame that shares the key column col1 with df.
data2 = {
    'col1': [1, 2],
    'col3': ['A', 'B'],
}
df2 = pd.DataFrame(data=data2)
# Inner join on col1: only keys present in both frames are kept.
df_merged = df.merge(df2, on='col1', how='inner')
print("\nMerged DataFrame:", df_merged, sep="\n")

# iv. Handling the missing values in datasets
# Work on a copy so the loaded CSV frame stays intact for the other steps.
df_missing = df_csv.copy()
# .loc slicing is label-based and inclusive, so index labels 0 through 5
# (six rows) are blanked out to simulate missing data.
df_missing.loc[0:5, 'total_bedrooms'] = np.nan
print("\nDataFrame with missing values (total_bedrooms):")
print(df_missing.head())

# Impute total_bedrooms with its own mean. Passing a {column: value} mapping
# restricts the fill to that column; a bare scalar would also overwrite NaN in
# every other column with the total_bedrooms mean, which would be meaningless
# for, e.g., longitude or median_income.
df_filled = df_missing.fillna({'total_bedrooms': df_missing['total_bedrooms'].mean()})
print("\nDataFrame after filling missing values with mean:")
print(df_filled.head())

# v. Feature Normalization: Min-max normalization
from sklearn.preprocessing import MinMaxScaler

# Scale a copy so the loaded CSV frame keeps its original values.
df_scaled = df_csv.copy()
scaler = MinMaxScaler()
# Learn the per-column minimum and maximum of the two selected features...
scaler.fit(df_scaled[['median_income', 'housing_median_age']])
# ...then write the rescaled values back into those same two columns.
df_scaled[['median_income', 'housing_median_age']] = scaler.transform(
    df_scaled[['median_income', 'housing_median_age']]
)
print("\nDataFrame after Min-Max Normalization on median_income and housing_median_age:")
print(df_scaled.head())


Reshaped DataFrame:
   col1 variable  value
0     1     col2      3
1     2     col2      4

Filtered DataFrame (total_rooms > 100):
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.05     37.37                27.0       3885.0           661.0   
1    -118.30     34.26                43.0       1510.0           310.0   
2    -117.81     33.78                27.0       3589.0           507.0   
4    -119.67     36.33                19.0       1241.0           244.0   
5    -119.56     36.51                37.0       1018.0           213.0   

   population  households  median_income  median_house_value  
0      1537.0       606.0         6.6085            344700.0  
1       809.0       277.0         3.5990            176500.0  
2      1484.0       495.0         5.7934            270500.0  
4       850.0       237.0         2.9375             81700.0  
5       663.0       204.0         1.6635             67000.0  

Merged DataFrame:
   col1  col2 col3