DONE
Step 1: Preprocess the data - use the same function across the board
Clearing out the capitalization
Clearing out em dashes, symbols
Clearing out names

TO DO
Step 2: Implementing the features
- Goal: Train a logistic regression on 2 feature representations

Step 3: Implementing the regression based on that data

# Part 1: Imports and Cleaning Text

In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import numpy as np
from sklearn.linear_model import LogisticRegression


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read in data
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/val.csv')

In [4]:
# Helper function for cleaning text
def clean_html(text):
    if pd.isna(text):
        return text
    # Remove HTML tags
    clean = re.sub(r'<.*?>', '', str(text))
    # Remove extra whitespaces
    clean = re.sub(r'\s+', ' ', clean).strip()
    # Replace HTML entities
    clean = re.sub(r'&amp;', '&', clean)
    clean = re.sub(r'&lt;', '<', clean)
    clean = re.sub(r'&gt;', '>', clean)
    clean = re.sub(r'&quot;|&#34;', '"', clean)
    clean = re.sub(r'&apos;|&#39;', "'", clean)
    return clean

In [5]:
# use the clean_html function to clean the training data
train['snip'] = train['snip'].apply(clean_html)
val['snip'] = val['snip'].apply(clean_html)

print(train)
print(val)

                                                    snip   channel
0      first of all, it feels like covid again but in...  FOXNEWSW
1      to be a software drivenrganization where softw...     CSPAN
2      you discuss the power of ai to revolutionize t...    CSPAN2
3      ai bots like chatgpt and google's bard gained ...   BBCNEWS
4      . >> i could sleep ten hours ai night if i was...  FOXNEWSW
...                                                  ...       ...
19868  cardiovascular science, but they're also pione...  FOXNEWSW
19869  i of ai in different fields. have of ai in dif...   BBCNEWS
19870  weighing down on the major averages, both tech...      KTVU
19871  i also think crypto ai that legislation be fro...    CSPAN2
19872  as we have worked to monitor the adoption iden...    CSPAN2

[19873 rows x 2 columns]
                                                   snip    channel
0     . ♪ >> there's a kyu cho right have things tha...  BLOOMBERG
1     he says the ai tool helped cre

In [10]:
# evaluation metric equation
def eval(y_pred, y_true):
    correct = (y_pred == y_true)   # Boolean array: True if correct, False if wrong
    accuracy = correct.sum() / len(y_true)  # Correct / Total
    return accuracy

In [6]:
# Initialize and fit TF-IDF
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(train['snip'])

# Calculate word complexity per snip
word_complexity = X_tfidf.sum(axis=1) / (X_tfidf != 0).sum(axis=1)
word_complexity = np.array(word_complexity).flatten()

# Add it to the train dataframe
train['word_complexity'] = word_complexity

print(train)

                                                    snip   channel  \
0      first of all, it feels like covid again but in...  FOXNEWSW   
1      to be a software drivenrganization where softw...     CSPAN   
2      you discuss the power of ai to revolutionize t...    CSPAN2   
3      ai bots like chatgpt and google's bard gained ...   BBCNEWS   
4      . >> i could sleep ten hours ai night if i was...  FOXNEWSW   
...                                                  ...       ...   
19868  cardiovascular science, but they're also pione...  FOXNEWSW   
19869  i of ai in different fields. have of ai in dif...   BBCNEWS   
19870  weighing down on the major averages, both tech...      KTVU   
19871  i also think crypto ai that legislation be fro...    CSPAN2   
19872  as we have worked to monitor the adoption iden...    CSPAN2   

       word_complexity  
0             0.057407  
1             0.079768  
2             0.076151  
3             0.077301  
4             0.075782  
...      

In [7]:
# Transform validation snips using the same TF-IDF vectorizer
x_val_tfidf = vectorizer.transform(val['snip'])

# Compute complexity
val_word_complexity = x_val_tfidf.sum(axis=1) / (x_val_tfidf != 0).sum(axis=1)
val_word_complexity = np.array(val_word_complexity).flatten()

# Add to val DataFrame
val['word_complexity'] = val_word_complexity

print(val)

                                                   snip    channel  \
0     . ♪ >> there's a kyu cho right have things tha...  BLOOMBERG   
1     he says the ai tool helped create a new fronti...       KPIX   
2     . >> the all new godaddy arrow put your busine...       CNNW   
3     in some cases they are powered by generative a...      CSPAN   
4     this was a ivotal it comes to ai. this was a p...    BBCNEWS   
...                                                 ...        ...   
3034  however, the ai trade is only one part of the ...       CNBC   
3035  oz but also was highlighted as a product by cr...     CSPAN2   
3036  the all new godaddy airo helps you get your bu...       CNBC   
3037  we are going to be way ahead on ai. we have to...       CNBC   
3038  his fourth management role after spells at der...    BBCNEWS   

      word_complexity  
0            0.088216  
1            0.081808  
2            0.074722  
3            0.082919  
4            0.083567  
...            

In [8]:
# X = features, y = labels
x_train = train[['word_complexity']]  # Needs to be 2D
y_train = train['channel']

x_val = val[['word_complexity']]
y_val = val['channel']

In [11]:
# Train
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Predict
val_preds = model.predict(x_val)
print(val_preds)

# Evaluate
val_accuracy = eval(val_preds, y_val)
print(val_accuracy)

['CNNW' 'CNNW' 'CNNW' ... 'CNNW' 'CNNW' 'CNNW']
0.07535373478117802


now do the min/max training to create new average values of scores

scaled = current - min/ max-min

In [12]:
min = 1
max = 0

complexity = train['word_complexity']
print(complexity)

for i in range(len(complexity)):
    if complexity[i] <= min:
        min = complexity[i]
    if complexity[i] >= max:
        max = complexity[i]
    else:
        min = min
        max = max

print(min,max)

0        0.057407
1        0.079768
2        0.076151
3        0.077301
4        0.075782
           ...   
19868    0.080295
19869    0.093252
19870    0.080397
19871    0.079097
19872    0.080723
Name: word_complexity, Length: 19873, dtype: float64
0.035929255415410526 0.6822632439493239


In [13]:
# complexity is 2D
def scale(complexity):
    x_train = (train[['word_complexity']] - min )/ (max-min)
    return x_train

In [39]:
min_complexity = train["word_complexity"].min()
max_complexity = train["word_complexity"].max()
train["word_complexity_scaled"] = (train["word_complexity"] - min_complexity) / (max_complexity - min_complexity)
val["word_complexity_scaled"] = (val["word_complexity"] - min_complexity) / (max_complexity - min_complexity)

# X = features, y = labels
x_train = train[['word_complexity_scaled']]
y_train = train['channel']

x_val = val[['word_complexity_scaled']]
y_val = val['channel']

In [41]:
# Train
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# Predict
val_preds = model.predict(x_val)
print(val_preds)

# Evaluate
val_accuracy = eval(val_preds, y_val)
print(val_accuracy)

['CNNW' 'CNNW' 'CNNW' ... 'CNNW' 'CNNW' 'CNNW']
0.07666995722277065


In [42]:
# create list of all of the channels
channels = []

for i in range(len(train)):
    if train['channel'][i] not in channels:
        channels.append(train['channel'][i])
    else:
        channels = channels

# 

print(channels)


['FOXNEWSW', 'CSPAN', 'CSPAN2', 'BBCNEWS', 'GBN', 'KPIX', 'KGO', 'KNTV', '1TV', 'KRON', 'CSPAN3', 'SFGTV', 'RUSSIA24', 'KSTS', 'BLOOMBERG', 'MSNBCW', 'PRESSTV', 'KTVU', 'CNNW', 'FBC', 'CNBC', 'RUSSIA1', 'KDTV', 'DW', 'KQED', 'NTV', 'BELARUSTV', 'ALJAZ', 'RT', 'LINKTV', 'COM']


In [18]:
train_corpus = list(train['snip'].values)
val_corpus = list(val['snip'].values)
nlp_sentiment = pipeline("sentiment-analysis")
train["Sentiment"] = nlp_sentiment(train_corpus)
val["Sentiment"] = nlp_sentiment(val_corpus)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use mps:0


In [43]:
train['Sentiment_Label'] = [x.get('label') for x in train['Sentiment']]
train['Sentiment_Score'] = [x.get('score') for x in train['Sentiment']]
val['Sentiment_Label'] = [x.get('label') for x in val['Sentiment']]
val['Sentiment_Score'] = [x.get('score') for x in val['Sentiment']]

train.head()

Unnamed: 0,snip,channel,word_complexity,Sentiment,Sentiment_Label,Sentiment_Score,Sentiment_scaled,word_complexity_scaled
0,"first of all, it feels like covid again but in...",FOXNEWSW,0.057407,"{'label': 'NEGATIVE', 'score': 0.9903525710105...",NEGATIVE,0.990353,0.004713,0.033231
1,to be a software drivenrganization where softw...,CSPAN,0.079768,"{'label': 'POSITIVE', 'score': 0.9933253526687...",POSITIVE,0.993325,0.996727,0.067826
2,you discuss the power of ai to revolutionize t...,CSPAN2,0.076151,"{'label': 'NEGATIVE', 'score': 0.9971946477890...",NEGATIVE,0.997195,0.001292,0.06223
3,ai bots like chatgpt and google's bard gained ...,BBCNEWS,0.077301,"{'label': 'POSITIVE', 'score': 0.9900817275047...",POSITIVE,0.990082,0.995105,0.064009
4,. >> i could sleep ten hours ai night if i was...,FOXNEWSW,0.075782,"{'label': 'NEGATIVE', 'score': 0.8428422808647...",NEGATIVE,0.842842,0.078481,0.06166


In [44]:
train["Sentiment_Score"] = np.where(
    train["Sentiment_Label"] == "NEGATIVE", -(train["Sentiment_Score"]), train["Sentiment_Score"]
)

val["Sentiment_Score"] = np.where(
    val["Sentiment_Label"] == "NEGATIVE", -(val["Sentiment_Score"]), val["Sentiment_Score"]
)

train.head()

Unnamed: 0,snip,channel,word_complexity,Sentiment,Sentiment_Label,Sentiment_Score,Sentiment_scaled,word_complexity_scaled
0,"first of all, it feels like covid again but in...",FOXNEWSW,0.057407,"{'label': 'NEGATIVE', 'score': 0.9903525710105...",NEGATIVE,-0.990353,0.004713,0.033231
1,to be a software drivenrganization where softw...,CSPAN,0.079768,"{'label': 'POSITIVE', 'score': 0.9933253526687...",POSITIVE,0.993325,0.996727,0.067826
2,you discuss the power of ai to revolutionize t...,CSPAN2,0.076151,"{'label': 'NEGATIVE', 'score': 0.9971946477890...",NEGATIVE,-0.997195,0.001292,0.06223
3,ai bots like chatgpt and google's bard gained ...,BBCNEWS,0.077301,"{'label': 'POSITIVE', 'score': 0.9900817275047...",POSITIVE,0.990082,0.995105,0.064009
4,. >> i could sleep ten hours ai night if i was...,FOXNEWSW,0.075782,"{'label': 'NEGATIVE', 'score': 0.8428422808647...",NEGATIVE,-0.842842,0.078481,0.06166


In [45]:
min_sentiment = train["Sentiment_Score"].min()
max_sentiment = train["Sentiment_Score"].max()

train["Sentiment_scaled"] = (train["Sentiment_Score"] - min_sentiment) / (max_sentiment - min_sentiment)
val["Sentiment_scaled"] = (val["Sentiment_Score"] - min_sentiment) / (max_sentiment - min_sentiment)
train.head()


Unnamed: 0,snip,channel,word_complexity,Sentiment,Sentiment_Label,Sentiment_Score,Sentiment_scaled,word_complexity_scaled
0,"first of all, it feels like covid again but in...",FOXNEWSW,0.057407,"{'label': 'NEGATIVE', 'score': 0.9903525710105...",NEGATIVE,-0.990353,0.004713,0.033231
1,to be a software drivenrganization where softw...,CSPAN,0.079768,"{'label': 'POSITIVE', 'score': 0.9933253526687...",POSITIVE,0.993325,0.996727,0.067826
2,you discuss the power of ai to revolutionize t...,CSPAN2,0.076151,"{'label': 'NEGATIVE', 'score': 0.9971946477890...",NEGATIVE,-0.997195,0.001292,0.06223
3,ai bots like chatgpt and google's bard gained ...,BBCNEWS,0.077301,"{'label': 'POSITIVE', 'score': 0.9900817275047...",POSITIVE,0.990082,0.995105,0.064009
4,. >> i could sleep ten hours ai night if i was...,FOXNEWSW,0.075782,"{'label': 'NEGATIVE', 'score': 0.8428422808647...",NEGATIVE,-0.842842,0.078481,0.06166


In [48]:
x_train = train[['word_complexity_scaled', 'Sentiment_scaled']]  # Needs to be 2D
x_val = val[['word_complexity_scaled', 'Sentiment_scaled']]
model = LogisticRegression(max_iter=1000)

model.fit(x_train, y_train)

# Predict
val_preds = model.predict(x_val)
print(val_preds)

# Evaluate
val_accuracy = eval(val_preds, y_val)
print(val_accuracy)

['CNNW' 'CNNW' 'CNNW' ... 'CNNW' 'CNNW' 'CNNW']
0.07634090161237249


In [49]:
print(train["channel"].value_counts())

channel
CNNW         2725
FBC          1608
BBCNEWS      1576
BLOOMBERG    1441
CNBC         1319
MSNBCW       1141
FOXNEWSW     1106
CSPAN         913
CSPAN2        904
KNTV          842
KTVU          766
KGO           763
KRON          760
GBN           717
CSPAN3        664
KPIX          400
SFGTV         360
DW            253
ALJAZ         230
NTV           209
KDTV          176
1TV           158
KSTS          142
RUSSIA24      122
PRESSTV       115
KQED          113
BELARUSTV     109
RUSSIA1       108
RT             50
LINKTV         43
COM            40
Name: count, dtype: int64
