In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import MLutils as ml

In [2]:
# Load data
columns = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our',
           'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order',
           'word_freq_mail', 'word_freq_receive', 'word_freq_will', 'word_freq_people',
           'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business',
           'word_freq_email', 'word_freq_you', 'word_freq_credit', 'word_freq_your',
           'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp', 'word_freq_hpl',
           'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs',
           'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
           'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm',
           'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original',
           'word_freq_project', 'word_freq_re', 'word_freq_edu', 'word_freq_table',
           'word_freq_conference', 'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!',
           'char_freq_$', 'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest',
           'capital_run_length_total']

spambase = pd.read_csv('spambase.data', header=None, delim_whitespace=False, names=columns + ['spam'])

spam_polluted_train = pd.read_csv('spam_polluted/train_feature.txt', header=None, delim_whitespace=True)
spam_polluted_train['spam'] = pd.read_csv('spam_polluted/train_label.txt', header=None, delim_whitespace=True,
                                          names=['spam'])

spam_polluted_test = pd.read_csv('spam_polluted/test_feature.txt', header=None, delim_whitespace=True)
spam_polluted_test['spam'] = pd.read_csv('spam_polluted/test_label.txt', header=None, delim_whitespace=True,
                                         names=['spam'])

spam_polluted_train.columns = spam_polluted_train.columns.astype(str)
spam_polluted_test.columns = spam_polluted_test.columns.astype(str)

# Combine polluted train and test data
spam_polluted = pd.concat([spam_polluted_train, spam_polluted_test], ignore_index=True)

In [3]:
# Display data
display(spambase)
display(spam_polluted_train)
display(spam_polluted_test)

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,spam
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048,1049,1050,1051,1052,1053,1054,1055,1056,spam
0,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.068515,0.014376,0.068351,0.032469,0.014087,0.034152,0.051189,0.063388,0.043658,1
1,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.050598,0.071449,0.034827,0.037157,0.051147,0.067859,0.052220,0.004742,0.009583,1
2,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.045824,0.011757,0.031530,0.032750,0.073789,0.045900,0.072750,0.040348,0.029986,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.013711,0.054859,0.004493,0.053976,0.029885,0.063413,0.070903,0.026120,0.008427,1
4,0.00,0.00,0.00,0.0,1.85,0.00,0.00,1.85,0.00,0.00,...,0.071441,0.058734,0.058241,0.034914,0.018111,0.019574,0.009803,0.065727,0.058667,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4135,0.00,0.00,1.19,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.008603,0.012331,0.041340,0.021970,0.059619,0.008046,0.015922,0.028007,0.033463,0
4136,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.046379,0.062328,0.057656,0.042930,0.022706,0.001332,0.053998,0.073089,0.056421,0
4137,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.037031,0.046458,0.005940,0.074133,0.045335,0.061015,0.068747,0.028783,0.007017,0
4138,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.017454,0.058703,0.004143,0.016548,0.048861,0.019306,0.076441,0.067222,0.002527,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1048,1049,1050,1051,1052,1053,1054,1055,1056,spam
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.0,0.00,...,0.037503,0.040918,0.000563,0.006243,0.041089,0.014344,0.042052,0.045074,0.058401,1
1,0.00,0.00,0.00,0.0,0.00,0.00,0.96,0.00,0.0,1.92,...,0.016471,0.053028,0.074753,0.044093,0.049550,0.010641,0.051331,0.018565,0.038492,1
2,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,...,0.050797,0.020699,0.011128,0.062373,0.015312,0.002700,0.005830,0.042279,0.031624,1
3,1.17,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,...,0.001639,0.033305,0.046086,0.041225,0.029758,0.071276,0.069395,0.001066,0.071314,1
4,0.30,0.00,0.00,0.0,0.61,0.92,0.00,2.45,0.0,0.00,...,0.008402,0.057602,0.035860,0.050893,0.030966,0.075909,0.015172,0.002239,0.066309,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
456,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,...,0.059033,0.042735,0.032502,0.027365,0.024529,0.042266,0.035607,0.050341,0.019569,0
457,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,...,0.069568,0.052859,0.037492,0.073815,0.075152,0.051175,0.004231,0.053001,0.018082,0
458,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,...,0.039451,0.001632,0.050405,0.040429,0.053173,0.029333,0.029063,0.028139,0.015943,0
459,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.0,0.00,...,0.027839,0.006222,0.064354,0.033661,0.055305,0.027338,0.016306,0.043003,0.044025,0


In [4]:
# Normalize data without labels
spambase.iloc[:, :-1] = ml.normalize(spambase.iloc[:, :-1])[0]
spam_polluted_train.iloc[:, :-1], spam_polluted_train_avg, spam_polluted_train_stdev = \
    ml.normalize(spam_polluted_train.iloc[:, :-1])
spam_polluted_test.iloc[:, :-1] = (spam_polluted_test.iloc[:,
                                   :-1] - spam_polluted_train_avg) / spam_polluted_train_stdev

In [5]:
# Split data using 80/20 split sklearn function
spam_train, spam_test = model_selection.train_test_split(spambase, test_size=0.2, random_state=0)

# Problem 1

## AdaBoost Feature Analysis

In [6]:
ada_spam = ml.AdaBoost(num_classifiers=300, splitter="best")
ada_spam.fit(spam_train)

### Accuracy

In [7]:
print('Accuracy: ', ada_spam.accuracy(spam_test))

Accuracy:  0.9250814332247557


### Feature Importance

In [9]:
print(ada_spam.get_top_features(15))

    Feature  Importance
0        51    1.084820
5        26    0.471287
1         6    0.461775
3        24    0.288452
2        52    0.234118
13       41    0.157328
4        55    0.148654
6        20    0.105483
7        45    0.081390
9        15    0.072486
16       38    0.071874
8        44    0.071731
11       54    0.071247
18       40    0.067786
12        4    0.053180


## Polluted Spam AdaBoost

In [9]:
ada_polluted = ml.AdaBoost()
ada_polluted.fit(spam_polluted_train)

### Accuracy

In [10]:
print('Accuracy: ', ada_polluted.accuracy(spam_polluted_test))

Accuracy:  0.9414316702819957


# Problem 2

## Gaussian Naive Bayes

In [11]:
gnb_spam = ml.GaussianNaiveBayes()

### Accuracy

In [12]:
print('Accuracy: ', gnb_spam.cross_validate(spam_polluted))

Accuracy:  0.6282608695652174


## sklearn PCA

In [13]:
pca = PCA(n_components=100)
pca.fit(spam_polluted)
pca_spam_df = pd.DataFrame(pca.transform(spam_polluted))
pca_spam_df['spam'] = spam_polluted['spam']

### GNB

In [14]:
gnb_pca_spam = ml.GaussianNaiveBayes()

#### Accuracy

In [15]:
print('Accuracy: ', gnb_pca_spam.cross_validate(pca_spam_df))

Accuracy:  0.773695652173913


## Implemented PCA

In [16]:
pca_spam = ml.PCA(100)
pca_spam.fit(spam_polluted)
pca_spam_df = pd.DataFrame(pca_spam.transform(spam_polluted))
pca_spam_df['spam'] = spam_polluted['spam']

### GNB

In [17]:
gnb_pca_spam = ml.GaussianNaiveBayes()

#### Accuracy

In [18]:
print('Accuracy: ', gnb_pca_spam.cross_validate(pca_spam_df))

Accuracy:  0.7797826086956522


# Problem 3

In [19]:
# Load data
missing_train = pd.read_csv('20_percent_missing_train.txt', header=None, delim_whitespace=False)

missing_test = pd.read_csv('20_percent_missing_test.txt', header=None, delim_whitespace=False)

missing_train.columns = missing_train.columns.astype(str)
missing_test.columns = missing_test.columns.astype(str)

missing = pd.concat([missing_train, missing_test], ignore_index=True)

In [20]:
# Display data
display(missing_train)
display(missing_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.0,0.64,,0.0,0.32,0.00,,0.00,,0.00,...,0.000,0.000,,0.778,0.000,0.0,3.756,61.0,,1.0
1,,0.00,0.00,0.0,,,0.31,0.63,0.31,0.63,...,0.000,0.137,,0.137,0.000,0.0,3.537,40.0,191.0,1.0
2,,0.00,0.00,,0.63,,,0.63,0.31,0.63,...,0.000,,0.0,,0.000,,3.537,40.0,191.0,1.0
3,0.0,,,0.0,1.85,0.00,,1.85,0.00,,...,,0.223,0.0,0.000,0.000,0.0,3.000,,54.0,1.0
4,,0.00,0.00,0.0,1.92,0.00,0.00,,,0.64,...,,0.054,0.0,0.164,0.054,0.0,1.671,4.0,112.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3676,,0.00,0.62,0.0,0.00,0.31,0.00,0.00,,,...,0.000,,0.0,0.000,,,1.142,3.0,,0.0
3677,,0.00,0.00,0.0,0.00,,0.00,,0.00,,...,,0.000,0.0,0.353,0.000,0.0,,,,0.0
3678,0.3,,0.30,,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,,0.0,1.404,6.0,118.0,0.0
3679,,,0.00,0.0,0.32,,0.00,,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.0,,,78.0,0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,0.21,0.28,,,,,,0.07,0.00,0.94,...,0.000,0.132,0.000,0.372,,0.048,5.114,101.0,,1.0
1,0.00,0.00,0.25,0.0,0.38,0.25,,0.00,0.00,0.00,...,0.022,0.044,,,0.000,0.000,,11.0,184.0,1.0
2,0.05,0.07,0.10,0.0,0.76,,,,0.55,0.00,...,0.042,,0.016,0.250,0.046,0.059,2.569,,2259.0,1.0
3,,,,0.0,0.00,0.00,0.0,0.00,,,...,,0.352,0.000,2.112,0.000,0.000,3.909,11.0,,1.0
4,,0.00,0.00,,2.94,0.00,,0.00,0.00,0.00,...,0.335,0.335,0.000,0.671,,0.000,4.000,12.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,0.00,,0.00,0.0,0.00,0.00,0.0,0.00,0.00,0.00,...,,,,0.000,0.000,,3.333,,,0.0
916,0.00,0.00,1.02,0.0,0.00,,0.0,0.00,,,...,0.000,,0.000,0.000,0.000,0.000,,5.0,28.0,0.0
917,0.00,0.00,0.00,0.0,,0.00,,,,0.00,...,0.000,0.254,0.000,0.000,0.000,0.000,1.000,,13.0,0.0
918,0.00,0.00,0.00,0.0,0.00,,0.0,0.00,0.00,,...,0.000,0.000,0.000,0.000,0.000,,1.266,,19.0,0.0


## Missing Data Bernoulli Naive Bayes

In [21]:
bnb_missing = ml.MissingValuesBernoulliNaiveBayes()
bnb_missing.fit(missing_train)

### Accuracy

In [22]:
print('Accuracy: ', bnb_missing.accuracy(missing_test))

Accuracy:  0.8076086956521739


# Problem 4

In [114]:
# Load data
news_train = ml.load_newsgroup('8newsgroup/train.trec/feature_matrix.txt')
news_test = ml.load_newsgroup('8newsgroup/test.trec/feature_matrix.txt')

display(news_train)
display(news_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1745,1746,1747,1748,1749,1750,1751,1752,1753,1754
0,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.633042,0.751372,0.000000,0.934572,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,1.233315,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.934572,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,1.079151,0.00000,0.924446,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,0.000000,0.000000,0.422553,0.000000,0.54142,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11310,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11311,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11312,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1745,1746,1747,1748,1749,1750,1751,1752,1753,1754
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## L1 Feature Selection

In [115]:
l1 = LogisticRegression(penalty='l1', solver='liblinear')
l1.fit(news_train.iloc[:, :-1], news_train.iloc[:, -1])

### Select Top 200 Features

In [117]:
feature_indices = np.argsort(np.abs(l1.coef_))[0, -200:]
news_train_selected = news_train.iloc[:, feature_indices]
news_test_selected = news_test.iloc[:, feature_indices]

In [118]:
# Display data
display(news_train_selected)
display(news_test_selected)

Unnamed: 0,992,920,1541,16,497,84,648,802,785,921,...,737,766,1589,461,471,469,455,451,441,1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.751372
2,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.523528,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
11310,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
11311,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.172719,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000
11312,0.0,0.0,0.0,0.0,0.0,0.0,0.286889,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000


Unnamed: 0,992,920,1541,16,497,84,648,802,785,921,...,737,766,1589,461,471,469,455,451,441,1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
# Append labels
news_train_selected['label'] = news_train.iloc[:, -1]
news_test_selected['label'] = news_test.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_train_selected['label'] = news_train.iloc[:, -1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  news_test_selected['label'] = news_test.iloc[:, -1]


## L2 Regression

In [120]:
l2 = LogisticRegression(penalty='l2', solver='liblinear')
l2.fit(news_train_selected.iloc[:, :-1], news_train_selected.iloc[:, -1])

### Accuracy Per Class

In [121]:
print(classification_report(l2.predict(news_test_selected.iloc[:, :-1]), news_test_selected.iloc[:, -1]))

              precision    recall  f1-score   support

         0.0       0.88      0.92      0.90       933
         1.0       0.94      0.71      0.81      3618
         2.0       0.59      0.79      0.68       294
         3.0       0.69      0.88      0.77       624
         4.0       0.85      0.91      0.88       749
         5.0       0.23      0.74      0.35       124
         6.0       0.47      0.79      0.59       238
         7.0       0.75      0.83      0.79       952

    accuracy                           0.79      7532
   macro avg       0.68      0.82      0.72      7532
weighted avg       0.84      0.79      0.80      7532
