In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import mode, norm
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Load the breast-cancer table and name the label column used by later cells.
df = pd.read_csv('/content/Breast_cancer_data.csv')
target_attributes = 'diagnosis'
# The preview goes last: a notebook cell only renders its final expression,
# so a mid-cell `df.head()` is computed and silently discarded.
df.head()

In [None]:
def entropy(df, atri):
    """Return the Shannon entropy (in bits) of column ``atri`` of ``df``.

    Class frequencies are taken from ``value_counts()``; each class
    contributes ``-p * log2(p)``. When the column yields no counts the
    integer ``0`` is returned.
    """
    class_counts = df[atri].value_counts()
    n_rows = class_counts.sum()
    bits = 0
    for n_in_class in class_counts:
        share = n_in_class / n_rows
        bits = bits - share * math.log2(share)
    return bits
# Report the entropy of the 'diagnosis' column over the whole of `df`.
print("ENTROPY")
print("DIAGNOSIS:",entropy(df,'diagnosis'))

ENTROPY
DIAGNOSIS: 0.9526351224018599


In [None]:
def gain(df, target_attribute, attribute):
    """Return the information gain of splitting ``df`` on ``attribute``.

    The gain is the entropy of ``target_attribute`` over all rows minus the
    weighted entropy of ``target_attribute`` inside each partition, where a
    partition is the set of rows sharing one exact value of ``attribute``.
    """
    value_frequencies = df[attribute].value_counts()
    n_rows = value_frequencies.sum()
    weighted_entropy = 0
    for attr_value, frequency in value_frequencies.items():
        partition = df[df[attribute] == attr_value]
        # Weight each partition's entropy by the fraction of rows it holds.
        weighted_entropy += (frequency / n_rows) * entropy(partition, target_attribute)
    return entropy(df, target_attribute) - weighted_entropy

In [None]:
def split(df, prediction):
    """Return the column of ``df`` with the largest information gain on ``prediction``.

    Every column other than ``prediction`` is a candidate. (The previous
    ``columns[1:-1]`` slice silently skipped the first feature and assumed
    the target was the last column.)

    Ties keep the earliest column. If no column has a strictly positive
    gain, the first candidate is returned.

    Raises:
        IndexError: if ``df`` has no column besides ``prediction``.
    """
    candidates = [name for name in df.columns if name != prediction]
    if not candidates:
        raise IndexError("split() needs at least one column besides the target")
    best_column = candidates[0]
    best_gain = 0
    for name in candidates:
        column_gain = gain(df, prediction, name)
        # Strict comparison: the first column reaching the maximum wins.
        if column_gain > best_gain:
            best_column, best_gain = name, column_gain
    return best_column
# Pick the splitting column for the full dataset and report it.
split_attribute = split(df, "diagnosis")
print("Splitting attribute:", split_attribute)

Splitting attribute: mean_area


In [None]:
def iD3(df, target_attribute, attributes):
    """Recursively build an ID3-style decision tree as nested dicts.

    Args:
        df: rows to partition.
        target_attribute: name of the label column.
        attributes: pandas ``Index`` of attribute names still unused on this
            path. It is bookkeeping only: the splitting column is chosen by
            ``split(df, target_attribute)``.

    Returns:
        ``None`` for an empty frame, a bare label for a leaf, otherwise
        ``{"ATTRIBUTE": name, "CHILDREN": {value: subtree, ...}}`` with one
        child per distinct value of the chosen column.
    """
    if df.empty:
        return None
    labels = df[target_attribute]
    if len(labels.unique()) == 1:
        return labels.iloc[0]
    split_attribute = split(df, target_attribute)
    branch_values = df[split_attribute].unique()
    if len(branch_values) <= 1:
        # The chosen column cannot separate these rows: recursing would
        # revisit the identical subset forever. Stop with a majority leaf.
        return labels.mode().iloc[0]
    # errors="ignore": split() may re-pick a column already removed on this path.
    remaining = attributes.drop(split_attribute, errors="ignore")
    root = {"ATTRIBUTE": split_attribute, "CHILDREN": {}}
    for value in branch_values:
        subset = df[df[split_attribute] == value]
        root["CHILDREN"][value] = iD3(subset, target_attribute, remaining)
    return root

# Build the tree over the full dataset; `attributes` starts as every column
# of `df` except the label column.
target_attribute = 'diagnosis'
attributes = df.columns.drop(target_attribute)

tree = iD3(df, target_attribute, attributes)
print(tree)

{'ATTRIBUTE': 'mean_area', 'CHILDREN': {1001.0: 0, 1326.0: 0, 1203.0: 0, 386.1: 0, 1297.0: 0, 477.1: 0, 1040.0: 0, 577.9: 0, 519.8: 0, 475.9: 0, 797.8: 0, 781.0: 0, 1123.0: 0, 782.7: 0, 578.3: 0, 658.8: {'ATTRIBUTE': 'mean_texture', 'CHILDREN': {27.54: 0, 13.66: 1}}, 684.5: {'ATTRIBUTE': 'mean_texture', 'CHILDREN': {20.13: 0, 15.51: 1}}, 798.8: 0, 1260.0: 0, 566.3: 1, 520.0: 1, 273.9: 1, 704.4: 0, 1404.0: 0, 904.6: 0, 912.7: 0, 644.8: 0, 1094.0: 0, 732.4: 0, 955.1: 0, 1088.0: 0, 440.6: 0, 899.3: 0, 1162.0: 0, 807.2: 0, 869.5: 0, 633.0: 0, 523.8: 1, 698.8: 0, 559.2: {'ATTRIBUTE': 'mean_texture', 'CHILDREN': {20.82: 0, 10.94: 1}}, 563.0: 0, 371.1: 0, 1104.0: 0, 545.2: 0, 531.5: 0, 1076.0: 0, 201.9: 1, 534.6: 0, 449.3: 1, 561.0: 1, 427.9: 1, 571.8: 1, 437.6: 1, 1033.0: 0, 712.8: 0, 409.0: 1, 1152.0: 0, 656.9: 0, 527.2: 1, 224.5: 1, 311.9: 1, 221.8: 1, 645.7: 0, 260.9: 1, 499.0: 0, 668.3: 0, 269.4: 1, 394.1: 1, 250.5: 1, 502.5: 1, 1130.0: 0, 244.0: 1, 929.4: 0, 584.1: 0, 470.9: 1, 817.7: 0

In [None]:
def bootstrap(data, sample, size, random_state=None):
    """Draw ``sample`` bootstrap resamples of ``size`` rows each from ``data``.

    Args:
        data: DataFrame to resample (rows are picked positionally).
        sample: number of resamples to draw.
        size: number of rows in each resample.
        random_state: optional integer seed. When ``None`` (default) the
            global ``np.random`` state is used, as before; when given, a
            private ``RandomState`` makes the draws reproducible.

    Returns:
        A list of ``sample`` DataFrames; each is an independent copy of rows
        drawn with replacement and keeps the original index labels.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    n_rows = len(data)
    bootstrap_samples = []
    for _ in range(sample):
        indices = rng.choice(n_rows, size, replace=True)
        bootstrap_samples.append(data.iloc[indices].copy())
    return bootstrap_samples
# Synthetic frame kept from the original cell; the call below resamples `df`, not this frame.
original_data = pd.DataFrame({'Values': np.random.randint(0, 100, 100)})
# Fixed parameters instead of input(): prompts block non-interactive
# "Restart & Run All" / nbconvert execution. 5 and 3 are the values entered
# in the recorded run; edit them here to experiment.
num_samples = 5   # number of bootstrap resamples
sample_size = 3   # rows per resample
ans = bootstrap(df, num_samples, sample_size)
ans

ENTER NUMBER OF SAMPLES5
ENTER THE SIZE OF SAMPLE3


[     mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness  \
 319        12.43         17.00           78.60      477.3          0.07557   
 284        12.89         15.70           84.08      516.6          0.07818   
 269        10.71         20.39           69.50      344.9          0.10820   
 
      diagnosis  
 319          1  
 284          1  
 269          1  ,
      mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness  \
 397        12.80         17.46           83.05      508.3          0.08044   
 179        12.81         13.06           81.29      508.8          0.08739   
 283        16.24         18.77          108.80      805.1          0.10660   
 
      diagnosis  
 397          1  
 179          1  
 283          0  ,
      mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness  \
 105       13.110         15.56           87.21      530.2           0.1398   
 66         9.465         21.01           60.11      269.4  

In [None]:
def train_decision_tree(bootstrap_sample, target_column):
    """Fit a default ``DecisionTreeClassifier`` on one resample and return it.

    All columns of ``bootstrap_sample`` except ``target_column`` are used as
    features; ``target_column`` supplies the labels.
    """
    features = bootstrap_sample.drop(columns=[target_column])
    labels = bootstrap_sample[target_column]
    # fit() returns the fitted estimator itself.
    return DecisionTreeClassifier().fit(features, labels)


In [None]:
def predict_with_decision_tree(decision_tree, data):
    """Return ``decision_tree.predict(data)`` unchanged."""
    predicted_labels = decision_tree.predict(data)
    return predicted_labels


In [None]:
def aggregate_predictions(predictions):
    """Majority-vote a list of per-model prediction arrays.

    Args:
        predictions: sequence of equal-length 1-D label arrays, one per
            model. Labels should be numeric (``scipy.stats.mode`` is used
            for the vote).

    Returns:
        1-D array holding, for each position, the most frequent label
        across the models.
    """
    # Stack to shape (n_models, n_rows) so the vote runs down axis 0.
    votes = np.asarray(predictions)
    winners = mode(votes, axis=0)[0]
    # ravel() flattens the result whether or not SciPy keeps the reduced axis.
    return np.asarray(winners).ravel()


In [None]:
def random_forest_predict(original_data, num_samples, sample_size, target_column):
    """Train one tree per bootstrap resample and majority-vote their predictions.

    ``num_samples`` resamples of ``sample_size`` rows are drawn from
    ``original_data``; each trains a decision tree. Every tree then predicts
    for the feature columns of ``original_data`` itself (all columns except
    ``target_column``), and the per-row votes are combined by
    ``aggregate_predictions``.
    """
    features = original_data.drop(columns=[target_column])
    resamples = bootstrap(original_data, num_samples, sample_size)

    forest = []
    for resample in resamples:
        forest.append(train_decision_tree(resample, target_column))

    votes = []
    for tree_model in forest:
        votes.append(predict_with_decision_tree(tree_model, features))

    return aggregate_predictions(votes)


In [None]:
# Run the hand-rolled forest on the breast-cancer frame and show its votes.
final_prediction = random_forest_predict(
    original_data=df,
    num_samples=num_samples,
    sample_size=sample_size,
    target_column=target_attributes,
)
print(final_prediction)

NameError: name 'mode' is not defined

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Build the split and the scaled features in this cell so it runs on a fresh
# kernel: no earlier cell defines X_train_scaled / X_test_scaled / y_train.
X = df.drop(columns=[target_attributes])
y = df[target_attributes]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on training rows only
X_test_scaled = scaler.transform(X_test)        # reuse the training statistics

random_forest = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)

random_forest.fit(X_train_scaled, y_train)

In [None]:
# Score the fitted forest on the hold-out rows.
# NOTE(review): X, y, random_forest and X_test_scaled are not created in this
# cell; they must already exist from earlier cells — confirm on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
accuracy = random_forest.score(X_test_scaled, y_test)
print("Accuracy:", accuracy)

In [None]:
# Load scikit-learn's bundled iris dataset; later cells read its
# .data, .target, .target_names and .feature_names attributes.
from sklearn import datasets
iris = datasets.load_iris()
# NOTE(review): `colnames` is not referenced by any other visible cell.
colnames=['sepal_length','sepal_width','petal_length','petal_width','type']

In [None]:
# Show the class names and the feature names of the loaded dataset.
print(iris.target_names)
print(iris.feature_names)

In [None]:
# Peek at the first five feature rows, then print the full label array.
print(iris.data[0:5])
print(iris.target)

In [None]:
# `iris` (from load_iris) is not a DataFrame, so it has no .drop() and no
# 'type' column: features are in .data and integer class labels in .target.
# (train_test_split is already imported in the notebook's first cell.)
X = iris.data
y = iris.target
# Seeded so the reported accuracy is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = DecisionTreeClassifier(criterion='gini', random_state=42)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("ACCURACY:", metrics.accuracy_score(y_test, y_pred))

AttributeError: drop

In [None]:
# Rebind `clf` to a 100-tree random forest and fit it on the current
# X_train / y_train (whatever split the preceding cells left in scope).
from sklearn.ensemble import RandomForestClassifier

clf=RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with its main hyper-parameters spelled out explicitly.
# max_features='sqrt' replaces 'auto': for classifiers 'auto' meant sqrt and
# the alias is no longer accepted by current scikit-learn releases.
rf = RandomForestClassifier(bootstrap=True,
                             class_weight=None,
                             criterion='gini',
                             max_depth=None,
                             max_features='sqrt',
                             max_leaf_nodes=None,
                             min_impurity_decrease=0.0,
                             min_samples_leaf=1,
                             min_samples_split=2)


In [None]:
# Label the fitted model's importances with the iris feature names and rank
# them from most to least important.
feature_imp = (
    pd.Series(data=clf.feature_importances_, index=iris.feature_names)
    .sort_values(ascending=False)
)
print(feature_imp)

In [None]:
# Horizontal bar chart: one bar per feature, length = importance score.
sns.barplot(x=feature_imp, y=feature_imp.index)
plt.xlabel('FEATURE IMPORTANCE SCORE')
plt.ylabel('FEATURES')
plt.title('VISUALIZE IMPORTANT FEATURES')
# No plt.legend(): the bars carry no labels, so it only produced a
# "no artists with labels" warning and an empty legend.
plt.show()

In [None]:
# Refit a 100-tree forest and score ITS predictions. The original scored the
# stale `y_pred` left over from the decision-tree cell instead of `y_predict`.
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
print('ACCURACY: ', metrics.accuracy_score(y_test, y_predict))

In [None]:
# Breast-cancer data: features are every column except the last one, labels
# are the 'diagnosis' column.
# NOTE(review): this assumes 'diagnosis' is the final column of df — confirm.
X = df.iloc[:, :-1]
y=df['diagnosis']
# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline: seeded 100-tree forest trained on all feature columns, scored on
# the hold-out rows.
rf_full = RandomForestClassifier(n_estimators=100, random_state=42)
rf_full.fit(X_train, y_train)
y_pred_full = rf_full.predict(X_test)
accuracy_full = accuracy_score(y_test, y_pred_full)

In [None]:
# Feature selection driven by the baseline forest, with 0.05 passed as the
# selector's threshold. The selector is fitted on the training rows only and
# the same column selection is then applied to the test rows.
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(rf_full, threshold=0.05)
X_train_sfm = sfm.fit_transform(X_train, y_train)
X_test_sfm = sfm.transform(X_test)

In [None]:
# Same forest configuration, trained on the selected columns only, then both
# accuracies are reported side by side.
rf_sfm = RandomForestClassifier(n_estimators=100, random_state=42)
rf_sfm.fit(X_train_sfm, y_train)
y_pred_sfm = rf_sfm.predict(X_test_sfm)
accuracy_sfm = accuracy_score(y_test, y_pred_sfm)

print("Breast Cancer Dataset:")
print("Accuracy without feature selection:", accuracy_full)
print("Accuracy with feature selection:", accuracy_sfm)

In [None]:
# Iris from CSV: every column except 'Species' becomes a feature.
# NOTE(review): if Iris.csv carries an id/row-number column it stays in X and
# may leak the label — confirm the file's columns.
# From here on X, y and the train/test names refer to iris, not breast cancer.
df1 = pd.read_csv('/content/Iris.csv')
X = df1.drop(columns=['Species'])
y = df1['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Baseline forest on all iris feature columns (rebinds rf_full, y_pred_full
# and accuracy_full from the breast-cancer cells).
rf_full = RandomForestClassifier(n_estimators=100, random_state=42)
rf_full.fit(X_train, y_train)
y_pred_full = rf_full.predict(X_test)
accuracy_full = accuracy_score(y_test, y_pred_full)

In [None]:
# Feature selection for iris with the same 0.05 threshold; relies on
# SelectFromModel having been imported by an earlier cell.
sfm = SelectFromModel(rf_full, threshold=0.05)
X_train_sfm = sfm.fit_transform(X_train, y_train)
X_test_sfm = sfm.transform(X_test)

In [None]:
# Forest trained on the selected iris columns only, followed by the
# with/without-selection accuracy comparison.
rf_sfm = RandomForestClassifier(n_estimators=100, random_state=42)
rf_sfm.fit(X_train_sfm, y_train)
y_pred_sfm = rf_sfm.predict(X_test_sfm)
accuracy_sfm = accuracy_score(y_test, y_pred_sfm)

print("Iris Dataset:")
print("Accuracy without feature selection:", accuracy_full)
print("Accuracy with feature selection:", accuracy_sfm)