In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the Boston Housing CSV straight from GitHub into a DataFrame.
# To analyse a different dataset, point `url` at its path or URL instead.
url = "https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv"
data = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Preview the top rows to sanity-check the columns that were loaded
print("First few rows of the dataset:", data.head(), sep="\n")

First few rows of the dataset:
      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   

        b  lstat  medv  
0  396.90   4.98  24.0  
1  396.90   9.14  21.6  
2  392.83   4.03  34.7  
3  394.63   2.94  33.4  
4  396.90   5.33  36.2  


In [4]:
# Step 1: Preprocessing - count the missing entries in every column
# (`isna` is the same check as `isnull`; a column total of 0 means nothing is missing)
print("\nChecking for null values in each column:", data.isna().sum(), sep="\n")


Checking for null values in each column:
crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
b          0
lstat      0
medv       0
dtype: int64


In [5]:
# Step 2: Separate the response from the predictors
# y: the 'medv' column, used in this notebook as the house-price target
y = data['medv']
# X: every other column of the table
X = data.drop(columns='medv')

In [6]:
# Step 3: Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Step 4: Standardize the features (helpful for some models, though plain
# linear regression does not strictly need it).
# The scaler is fitted on the training rows only; that same fitted scaler then
# transforms both splits, so the test set never influences the scaling.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Step 5: Train the Multiple Linear Regression model
# A LinearRegression with default settings is fitted on the standardized
# training features (X_train_scaled) against the training targets (y_train).
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [9]:
# Step 6: Make predictions on the test data
# The model was trained on standardized features, so it is given the
# standardized test matrix (X_test_scaled) rather than the raw X_test.
y_pred = model.predict(X_test_scaled)

In [10]:
# Step 7: Score the test-set predictions against the true targets
mse = mean_squared_error(y_test, y_pred)  # mean of the squared residuals
r2 = r2_score(y_test, y_pred)             # coefficient of determination
print(
    "\nModel Performance:",
    f"Mean Squared Error: {mse}",
    f"R^2 Score: {r2}",
    sep="\n",
)


Model Performance:
Mean Squared Error: 24.291119474973495
R^2 Score: 0.6687594935356322


In [11]:
# Step 8: Put the true targets and the model's predictions side by side.
# y_test keeps its original row labels, so each row of `results` can be traced
# back to the corresponding row of the source table.
results = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
print("\nPredicted vs Actual values:", results.head(), sep="\n")


Predicted vs Actual values:
     Actual  Predicted
173    23.6  28.996724
274    32.4  36.025565
491    13.6  14.816944
72     22.8  25.031979
452    16.1  18.769880


In [12]:
# Q2: Logistic regression on a crash-survival dataset (features: age, speed; target: survived)

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report

In [14]:
# Load a previously saved copy of the crash dataset, if one exists.
# 'crash.csv' is only written by the next cell, so on a fresh checkout
# (or after Restart & Run All) the file is not there yet. Tolerate that instead
# of aborting the run with FileNotFoundError; the next cell assigns `df` anyway.
try:
    df = pd.read_csv('crash.csv')
except FileNotFoundError:
    df = None
    print("'crash.csv' not found yet - it is generated by the next cell.")

In [15]:
import pandas as pd
import numpy as np

# Build a small synthetic crash dataset and save it as 'crash.csv'.

# Number of samples
num_samples = 100

# Seed NumPy's global generator so the same rows are produced on every run
np.random.seed(0)

# The frame is built directly rather than through an intermediate dict named
# `data`: that name already holds the housing DataFrame loaded earlier in the
# notebook, and rebinding it to a dict would break any re-run of those cells.
#
# The columns are drawn in the order age -> speed -> survived. With a fixed
# seed the draw order determines the values, so do not reorder these entries.
#
# NOTE: 'survived' is sampled independently of 'age' and 'speed', so the
# features carry no signal about the label; a classifier trained on this data
# should not be expected to do better than chance.
df = pd.DataFrame({
    'age': np.random.randint(18, 80, size=num_samples),      # integers in [18, 80)
    'speed': np.random.uniform(10, 100, size=num_samples),   # floats in [10, 100)
    'survived': np.random.choice([0, 1], size=num_samples),  # binary label
})

# Save to CSV (index=False keeps the row index out of the file)
df.to_csv('crash.csv', index=False)

print("Dataset 'crash.csv' has been created successfully.")


Dataset 'crash.csv' has been created successfully.


In [16]:
# Quick inspection of the crash dataset
print(df.head())      # first few rows
# DataFrame.info() writes its report (dtypes, non-null counts) to stdout itself
# and returns None, so wrapping it in print() only adds a stray "None" line.
df.info()
print(df.describe())  # summary statistics of the numeric columns

   age      speed  survived
0   62  67.084665         0
1   65  96.305434         0
2   71  68.751129         1
3   18  67.155299         1
4   21  99.576961         1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       100 non-null    int32  
 1   speed     100 non-null    float64
 2   survived  100 non-null    int32  
dtypes: float64(1), int32(2)
memory usage: 1.7 KB
None
              age       speed    survived
count  100.000000  100.000000  100.000000
mean    46.620000   57.857976    0.590000
std     18.117897   25.366410    0.494311
min     18.000000   10.842103    0.000000
25%     31.750000   38.804353    0.000000
50%     46.500000   58.160295    1.000000
75%     60.500000   77.581365    1.000000
max     77.000000   99.576961    1.000000


In [17]:
# Discard any row that has a missing value in at least one column
df = df.dropna(axis=0)

In [18]:
# Target: the 'survived' column; predictors: the 'age' and 'speed' columns
y = df.loc[:, 'survived']
X = df.loc[:, ['age', 'speed']]

In [19]:
# Keep 20% of the rows aside for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit a logistic-regression classifier with default settings on the training split.
# Unlike the housing model earlier in the notebook, the features are passed in
# unscaled here (X_train comes straight from the split).
# Note: this rebinds `model`, replacing the linear-regression model fitted earlier.
model = LogisticRegression()
model.fit(X_train, y_train)

In [21]:
# Predict a class label for every row of the held-out test features
y_pred = model.predict(X_test)

In [22]:
# Accuracy: the share of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.45


In [23]:
# Confusion matrix: rows are the true classes, columns are the predicted classes
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[0 7]
 [4 9]]


In [24]:
# Per-class precision, recall, F1 and support, returned as a formatted text table
class_report = classification_report(y_true=y_test, y_pred=y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.56      0.69      0.62        13

    accuracy                           0.45        20
   macro avg       0.28      0.35      0.31        20
weighted avg       0.37      0.45      0.40        20



In [25]:
# Step 8: Make Predictions
# Run the classifier over the test features once more (the same call that
# produced y_pred above) and print the labels next to the true outcomes.
predicted_survivability = model.predict(X_test)

print("Predicted Survivability:", predicted_survivability)
print("Actual Outcomes:", y_test.to_numpy())

Predicted Survivability: [1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 0 1 0]
Actual Outcomes: [0 0 0 1 0 1 0 0 1 0 1 1 1 1 1 1 1 1 1 1]
