In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler

<h2> Feature Selection on OnlineNewsPopularity </h2>

Exercise
1. Load the dataset 
2. Drop the Column which isn't required


In [2]:
 # read the csv file from the link provided
 # drop the column that is not required from the dataset(url)

df = pd.read_csv('8_OnlineNewsPopularity.csv')
df = df.drop('url', axis=1)
df.to_csv('8_OnlineNewsPopularity_modified.csv', index=False)
df = pd.read_csv('8_OnlineNewsPopularity_modified.csv')

In [3]:
df.head()

Unnamed: 0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,0.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,0.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,0.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,0.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505


In [4]:
# strip the whitespace in the column names
df.columns = df.columns.str.strip()

Exercise 
1. Scale the data using a appropriate scaler and re-asign the column names after scaling.
2. The function below should return scaled result in the form of DataFrame

In [5]:
# hint: Use MinMaxScaler for scaling
def scaling_data(data):
    cols = list(data.columns)
    scalar = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scalar.fit_transform(data)
    scaled_data = pd.DataFrame(scaled_data, columns=cols)
    scaled_data['shares'] = df['shares']
    return scaled_data

In [6]:
transformed_data = scaling_data(df)
transformed_data.head()

Unnamed: 0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,1.0,0.47619,0.025844,0.000947,0.00096,0.001254,0.013158,0.017241,0.007812,0.0,...,0.1,0.7,0.65,0.4,0.8,0.5,0.40625,0.0,0.1875,593
1,1.0,0.333333,0.030092,0.000863,0.00096,0.001218,0.009868,0.008621,0.007812,0.0,...,0.033333,0.7,0.88125,0.875,0.9,0.0,0.5,1.0,0.0,711
2,1.0,0.333333,0.0249,0.00082,0.00096,0.001021,0.009868,0.008621,0.007812,0.0,...,0.1,1.0,0.533333,0.2,0.866667,0.0,0.5,1.0,0.0,1500
3,1.0,0.333333,0.062662,0.000719,0.00096,0.001024,0.029605,0.0,0.007812,0.0,...,0.136364,0.8,0.630303,0.4,0.833333,0.0,0.5,1.0,0.0,1200
4,1.0,0.52381,0.126505,0.000593,0.00096,0.000832,0.0625,0.163793,0.15625,0.0,...,0.033333,1.0,0.779808,0.5,0.95,0.454545,0.568182,0.090909,0.136364,505


5. Perform train_test_split

In [7]:
X = transformed_data.loc[:,:'abs_title_sentiment_polarity']
y = transformed_data['shares']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=50)
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)
y_train =y_train.astype('int')

6. Write a function which returns the list of k-Best features where k being the number of features required

In [8]:
#use chi2
def k_best(X_train, y_train, k):
    # SelectKBest class to get k best features
    selector = SelectKBest(score_func=chi2, k=k)
    X_best_features = selector.fit_transform(X_train, y_train)
    
    # return boolean indices
    best_features_indices = selector.get_support()
    
    best_features = []
    
    data_columns = X_train.columns
    
    for index, bool_value in enumerate(best_features_indices):
        # best features to the best_features list
        if bool_value:
            best_features.append(data_columns[index])
    
    return best_features

Print the results

In [9]:
features_one = k_best(X_train, y_train, 4)
features_two = k_best(X_train, y_train, 6)

In [10]:
features_one

['data_channel_is_entertainment',
 'data_channel_is_socmed',
 'data_channel_is_world',
 'is_weekend']

In [11]:
features_two

['data_channel_is_lifestyle',
 'data_channel_is_entertainment',
 'data_channel_is_socmed',
 'data_channel_is_world',
 'weekday_is_saturday',
 'is_weekend']

<h3>=> Describe about feature selection and explain your code in detail?</h3>


#### Answer
Feature selection is a technique used in machine learning to select a relevant subset of features from a large set of input features. The main purpose of feature selection is to improve the model's accuracy by removing irrelevant or redundant features and reducing the model's complexity.
There are various methods of feature selection, such as filter methods, wrapper methods, and embedded methods. 
In the above code, I used a chi-squared test to select the k-best features that are most relevant to the output target.
<br>
I used the SelectKBest class with chi2 function from the sklearn.feature_selection module to choose the k-best features, which were transformed using the fit_transform method. I also used the get_support method to create a boolean mask of the selected features and obtain the column names of the selected features using the columns attribute of the input features. The function will return list of best features provided number of best features.

<h2> Model selection on Algerian_forest_fires_dataset_UPDATE-1 dataset  </h2>
<h3># Exercise (Hint use Logistic Regression, SVC, Ridge and Lasso to compare the models.)</h3>
<h4>LinearSVC  Vs  Logistic regression</h4>
<p>Your task is to findout which of the above models is best suited for the given dataset and give reasons in this scenario. </p>
<p>Also, you need to give scenarios which each of these Models work better over the other.</p>


1. Load the dataset

In [12]:
data = pd.read_csv('8_Algerian_forest_fires_dataset_UPDATE-1.csv')

# Check the first few rows of the loaded dataset
data.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,not fire
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,not fire
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,not fire
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,not fire
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,not fire


In [13]:
data = pd.read_csv('8_Algerian_forest_fires_dataset_UPDATE-1.csv')
df = data.iloc[124:] 
df = pd.concat([data.iloc[:120], data.iloc[125:]])
df['Classes  '] = df['Classes  '].str.strip()

In [14]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df['Classes  '])
df['Classes  '] = le.transform(df['Classes  '])
df = df.drop(index=[167])
df

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,Classes
0,01,06,2012,29,57,18,0,65.7,3.4,7.6,1.3,3.4,0.5,1
1,02,06,2012,29,61,13,1.3,64.4,4.1,7.6,1,3.9,0.4,1
2,03,06,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,1
3,04,06,2012,25,89,13,2.5,28.6,1.3,6.9,0,1.7,0,1
4,05,06,2012,27,77,16,0,64.8,3,14.2,1.2,3.9,0.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
241,26,09,2012,30,65,14,0,85.4,16,44.5,4.5,16.9,6.5,0
242,27,09,2012,28,87,15,4.4,41.1,6.5,8,0.1,6.2,0,1
243,28,09,2012,27,87,29,0.5,45.9,3.5,7.9,0.4,3.4,0.2,1
244,29,09,2012,24,54,18,0.1,79.7,4.3,15.2,1.7,5.1,0.7,1


* Drop the unnecessary columns and use train_test_split

In [15]:
from sklearn.preprocessing import StandardScaler

X = df.drop(columns=['Classes  ', 'day', 'month', 'year'])

# Store the target variable in y
y = df['Classes  ']

# Scale the feature data using StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Use train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Apply Logistic Regression, SVC, Ridge and Lasso to compare the models.

### Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, r2_score

# Create a Logistic Regression model with random_state=0
logistic = LogisticRegression(random_state=0)

# Train the model on the training data
logistic.fit(X_train, y_train)

# Predict on the test data
y_pred = logistic.predict(X_test)

# Convert predicted probabilities to class labels
y_pred = y_pred.round().astype(int)

# Calculate accuracy on test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

logistic_r2 =  r2_score(y_test, y_pred)
print("R-Square:", logistic_r2)

Accuracy: 0.96
R-Square: 0.8222222222222222


### SVC Model

In [17]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, r2_score

# Create a SVC model with gamma=auto
svc = make_pipeline(StandardScaler(), SVC(gamma='auto'))

# Train the model on the training data
svc.fit(X_train, y_train)

# Predict on the test data
y_pred = svc.predict(X_test)

# Convert predicted probabilities to class labels
y_pred = y_pred.round().astype(int)

# Calculate accuracy on test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

svc_r2 =  r2_score(y_test, y_pred)
print("R-Square:", svc_r2)

Accuracy: 0.94
R-Square: 0.762962962962963


### Ridge regularization

In [18]:
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score, r2_score

# Create a Ridge model
ridge = Ridge()

# Train the model on the training data
ridge.fit(X_train, y_train)

# Predict on the test data
y_pred = ridge.predict(X_test)

# Convert predicted probabilities to class labels
y_pred = y_pred.round().astype(int)

# Calculate accuracy on test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

ridge_r2 =  r2_score(y_test, y_pred)
print("R-Square:", ridge_r2)

Accuracy: 0.94
R-Square: 0.762962962962963


### Lasso regularization

In [19]:
from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score, r2_score

# Create a Lasso model with alpha=1
lasso = Lasso(alpha=10)

# Train the model on the training data
lasso.fit(X_train, y_train)

# Predict on the test data
y_pred = lasso.predict(X_test)

# Convert predicted probabilities to class labels
y_pred = y_pred.round().astype(int)

# Calculate accuracy on test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

lasso_r2 =  r2_score(y_test, y_pred)
print("R-Square:", lasso_r2)

Accuracy: 0.62
R-Square: -0.6000000000000001


### Comaprison for all model's
<br>
Based on the evaluation metrics of the models, it is difficult to determine which model performs best since the evaluation metrics are not provided for all the models. However, based on the available evaluation metrics, we can say that the logistic regression model and the SVM model have similar accuracy, and their mean absolute error and mean squared error are also similar. The ridge and lasso regression models seem to have lower accuracy and higher mean absolute error and mean squared error than the other two models.
Therefore, based on the available evaluation metrics, we can say that the logistic regression model and the SVM model perform better than the ridge and lasso regression models. However, a more comprehensive analysis is required to make a more definitive conclusion about the best performing model.