In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

In [2]:
# Toy housing data: floor area, bedroom count and sale price per listing.
# The first four prices equal Size_sqft * Bedrooms * 1000; the last two rows
# do not follow that rule.
data = dict(
    Size_sqft=[300, 400, 200, 100, 500, 500],
    Bedrooms=[2, 2, 3, 3, 1, 5],
    Price=[600000, 800000, 600000, 300000, 1000000, 300000],
)

# Build the frame from the column dict and show it as the cell output
df = pd.DataFrame(data)
df

Unnamed: 0,Size_sqft,Bedrooms,Price
0,300,2,600000
1,400,2,800000
2,200,3,600000
3,100,3,300000
4,500,1,1000000
5,500,5,300000


In [3]:

# Goal: predict the price of a 350 sqft home with 2 bedrooms.

# Features and target
X = df[["Size_sqft", "Bedrooms"]]
y = df["Price"]

# Train linear regression model on every row (this cell keeps no hold-out set)
model = LinearRegression()
model.fit(X, y)

# Test sample: the inputs live in named variables so the printed message is
# built from the same values that are fed to the model and cannot drift apart.
size_sqft, bedrooms = 350, 2
test_sample = pd.DataFrame({"Size_sqft": [size_sqft], "Bedrooms": [bedrooms]})

# Prediction (predict returns an array with one entry per sample row)
predicted_price = model.predict(test_sample)
print(f"Predicted Price for Size={size_sqft} sqft, Bedrooms={bedrooms}: {predicted_price[0]:.2f}")


Predicted Price for Size=350 sqft, Bedrooms=4: 724396.78


In [4]:


# Larger toy dataset: eight listings with size, bedroom count and price
data = dict(
    Size_sqft=[850, 900, 1200, 1500, 2000, 2500, 3000, 3500],
    Bedrooms=[2, 2, 3, 3, 4, 4, 5, 5],
    Price=[250000, 270000, 300000, 310000, 450000, 500000, 550000, 600000],
)
df = pd.DataFrame(data)

# Separate the predictor columns from the column to be predicted
X = df[["Size_sqft", "Bedrooms"]]
y = df["Price"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit on the training rows only, then predict the held-out rows
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error metrics: mean squared error and its square root
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Table of held-out inputs next to the true prices and the rounded predictions
results = X_test.assign(**{
    "Actual Price": y_test,
    "Predicted Price": y_pred.round(2),
})

print("Test Results:")
print(results)
print("\nMean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)


Test Results:
   Size_sqft  Bedrooms  Actual Price  Predicted Price
1        900         2        270000        198842.98
5       2500         4        500000        454462.81
0        850         2        250000        196446.28

Mean Squared Error: 3334986225.8953176
Root Mean Squared Error: 57749.33961436544


![Decision tree structure](https://www.statlect.com/images/decision-tree-structure.png "Decision tree structure")


In [5]:
import numpy as np

def entropy(labels):
    """Return the Shannon entropy, in bits, of a collection of class labels.

    Args:
        labels: Array-like of class labels, one entry per sample.

    Returns:
        The entropy as a NumPy float. It is 0.0 when `labels` is empty or
        holds a single distinct class, and 1.0 for two equally frequent classes.
    """
    _, counts = np.unique(labels, return_counts=True)
    # Relative frequency of each distinct label
    probabilities = counts / counts.sum()
    # Negating a zero sum yields IEEE negative zero, which prints as "-0.0";
    # adding 0.0 turns it into plain 0.0 and leaves every other value unchanged.
    return -np.sum(probabilities * np.log2(probabilities)) + 0.0

# Example dataset: seven 'g' (good) labels followed by three 'b' (bad) labels
labels = ['g'] * 7 + ['b'] * 3
print("Entropy:", entropy(labels))


Entropy: 0.8812908992306927


In [6]:
def information_gain(parent, left_child, right_child):
    """Return the entropy reduction from splitting `parent` into two children.

    Each child's entropy is weighted by that child's size relative to the
    parent, and the two weighted terms are subtracted from the parent's entropy.

    Args:
        parent: Labels before the split.
        left_child: Labels sent to the left branch.
        right_child: Labels sent to the right branch.

    Returns:
        entropy(parent) minus the size-weighted entropies of the two children.
    """
    total = len(parent)
    left_share = len(left_child) / total
    right_share = len(right_child) / total

    weighted_children = (
        left_share * entropy(left_child) + right_share * entropy(right_child)
    )
    return entropy(parent) - weighted_children

# Worked example: a 7g/3b parent split into an all-'g' left child (4 labels)
# and a mixed right child (3 'g', 3 'b')
parent = ['g'] * 7 + ['b'] * 3
left_child = ['g'] * 4
right_child = ['g'] * 3 + ['b'] * 3

print("Information Gain:", information_gain(parent, left_child, right_child))


Information Gain: 0.2812908992306927


In [7]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.preprocessing import LabelEncoder

# Data: one row per faculty member
data = {
    'Name': ['Mike', 'Mary', 'Bill', 'Jim', 'Dave', 'Anne'],
    'Rank': ['Assistant Prof', 'Assistant Prof', 'Professor', 'Associate Prof', 'Assistant Prof', 'Associate Prof'],
    'Years': [3, 7, 2, 7, 6, 3],
    'Tenured': ['no', 'yes', 'yes', 'yes', 'no', 'no']
}
df = pd.DataFrame(data)

# Encode the string columns as integers. Separate encoders are kept so the
# rank of a new sample can be encoded and the prediction decoded again.
le_rank = LabelEncoder()
df['Rank_encoded'] = le_rank.fit_transform(df['Rank'])
le_ten = LabelEncoder()
y = le_ten.fit_transform(df['Tenured'])
X = df[['Years', 'Rank_encoded']]

# Train
# The seed matters here: after the root split on Years, the rows with
# Years <= 6.5 can be separated perfectly either by Years <= 2.5 or by
# Rank_encoded <= 1.5 (both isolate Bill). scikit-learn picks between equally
# good splits using its random state, so an unseeded fit can differ per run.
clf = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf.fit(X, y)

# Tree structure
print(export_text(clf, feature_names=['Years', 'Rank_encoded']))

# Predict Jeff: a Professor with 4 years
rank_prof = le_rank.transform(['Professor'])[0]
X_test = pd.DataFrame({'Years': [4], 'Rank_encoded': [rank_prof]})
pred = clf.predict(X_test)
# Prints 'no' when the tree splits on Years only (Years > 2.5 branch) and
# 'yes' when the tied Rank_encoded split is chosen instead.
print(le_ten.inverse_transform(pred)[0])

|--- Years <= 6.50
|   |--- Years <= 2.50
|   |   |--- class: 1
|   |--- Years >  2.50
|   |   |--- class: 0
|--- Years >  6.50
|   |--- class: 1

no


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

documents = [
    'I love programming in Python',
    'Python is great for machine learning',
    'I hate programming in Java',
    'Java is a versatile language',
    'I am learning data science with Python',
    'Java is mostly used in enterprise applications',
    'Python is easy to learn',
    'I dislike Java because of its verbosity'
]

# One label per document, in the same order: 1 = Python, 0 = Java
labels = [1, 1, 0, 0, 1, 0, 1, 0]

# Convert text to numerical features
# NOTE(review): the vectorizer is fitted on every document before the split,
# so its vocabulary and IDF weights also see the test rows. Fit it on the
# training documents only if the evaluation has to be leak-free.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

# Train Decision Tree
# Seeded like the split above: scikit-learn chooses between equally good
# splits using its random state, so without a seed the fitted tree (and the
# report below) may change between runs.
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

# Test new sentence: reuse the fitted vectorizer so the features line up
new_text = ["I am learning Java for enterprise applications"]
new_vector = vectorizer.transform(new_text)
prediction = model.predict(new_vector)

print("Prediction (1=Python, 0=Java):", prediction[0])


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         2

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

Prediction (1=Python, 0=Java): 0


## Multinomial Naïve Bayes (Text Classification)

In [9]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naïve Bayes classifier on the existing training split
nb_model = MultinomialNB().fit(X_train, y_train)

# Predict the held-out rows and print the per-class metrics
nb_pred = nb_model.predict(X_test)
nb_report = classification_report(y_test, nb_pred)
print("Naïve Bayes Report:\n", nb_report)


Naïve Bayes Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3



## Support Vector Machine (SVM)

In [10]:
from sklearn.svm import SVC

# Fit a linear-kernel support vector classifier on the existing training split
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Predict the held-out rows and print the per-class metrics
svm_pred = svm_model.predict(X_test)
svm_report = classification_report(y_test, svm_pred)
print("SVM Report:\n", svm_report)


SVM Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       1.00      0.50      0.67         2

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

