In [25]:
import ast
import csv
from collections import Counter

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

Getting the most common tags

In [26]:
# Read every article once and keep all rows in memory.
# A csv.DictReader is a one-shot iterator, so the rows must be materialised
# *before* anything else is derived from it; iterating the reader a second
# time yields nothing.
with open('medium_articles.csv', 'r', encoding="utf8") as file:
    reader = csv.DictReader(file)
    rows = list(reader)

# Raw content of the `tags` column, one string per article
tags = [row['tags'] for row in rows]

# Each cell is the textual form of a Python list of strings.
# NOTE: this used to be parsed with eval(), which executes arbitrary code
# found in the file; ast.literal_eval only accepts literals.
all_tags = [tag for tags_list in tags for tag in ast.literal_eval(tags_list)]

# Get the 100 most common tags (names only, ordered by descending count)
most_common_tags = [tag for tag, count in Counter(all_tags).most_common(100)]

In [27]:
# Show the most frequent tag names, ordered from most to least common
most_common_tags

['Blockchain',
 'Data Science',
 'Technology',
 'Programming',
 'Poetry',
 'Cryptocurrency',
 'Machine Learning',
 'Life',
 'Bitcoin',
 'Writing',
 'Politics',
 'Startup',
 'Life Lessons',
 'Self Improvement',
 'Covid 19',
 'Software Development',
 'Love',
 'Python',
 'Business',
 'Health',
 'Mental Health',
 'JavaScript',
 'Relationships',
 'Education',
 'Artificial Intelligence',
 'Culture',
 'Design',
 'Self',
 'Marketing',
 'Entrepreneurship',
 'Ethereum',
 'Music',
 'Productivity',
 'Web Development',
 'History',
 'Humor',
 'Data Visualization',
 'Leadership',
 'Social Media',
 'Psychology',
 'Travel',
 'Fiction',
 'Creativity',
 'Coronavirus',
 'Parenting',
 'Crypto',
 'Science',
 'Tech',
 'UX',
 'Short Story',
 'Money',
 'Spirituality',
 'Family',
 'Work',
 'Deep Learning',
 'Christmas',
 'Finance',
 'News',
 'Personal Development',
 'Data',
 'Art',
 'Women',
 'LGBTQ',
 'Digital Marketing',
 'Inspiration',
 'Books',
 'Philosophy',
 'Investing',
 'Coding',
 'Climate Change',
 'Mo

Based on the most common tags, we hand-pick 6 main tags, using common sense to choose the ones that are most clearly separated from each other

In [28]:
# Hand-picked target categories. They are written in lowercase because they
# are compared against lowercased article tags.
main_tags = [
    'programming',
    'business',
    'health',
    'marketing',
    'politics',
    'sports',
]

From the dataset we select the records that have any of the previously selected main tags and add them to the training data

If a record has none of the main tags, we add it to the prediction dataset

In [29]:
# Read the CSV file into a list of row dictionaries
with open('medium_articles.csv', 'r', encoding="utf8") as file:
    reader = csv.DictReader(file)
    rows = list(reader)

# Parse the textual list in the `tags` column and convert every tag to lowercase.
# NOTE: this used to be parsed with eval(), which executes arbitrary code
# found in the file; ast.literal_eval only accepts literals.
for row in rows:
    row['tags'] = [tag.lower() for tag in ast.literal_eval(row['tags'])]

# Partition the rows in a single pass: articles carrying at least one main tag
# become training data, all others are left for prediction. A set makes each
# membership check constant-time.
main_tag_set = set(main_tags)
training_data = []
prediction_data = []
for row in rows:
    if main_tag_set.isdisjoint(row['tags']):
        prediction_data.append(row)
    else:
        training_data.append(row)

print(len(training_data), 'records selected as training data')
print(len(prediction_data), 'records selected as prediction data')


21835 records selected as training data
170533 records selected as prediction data


We train a Naive Bayes classifier on the training data, using TfidfVectorizer to vectorize the tags in order to increase the accuracy of the approach

In [30]:
# Build one document per training article by joining its tags with spaces
X_train = [' '.join(row['tags']) for row in training_data]

# Label each article with the first main tag that appears in its own tag list.
# Taking element [0] of a set intersection is not reproducible: the iteration
# order of a set of strings changes between interpreter runs, so articles with
# several main tags would get different labels on every kernel restart.
# Every training row is known to contain at least one main tag, so next()
# always finds a value.
Y_train = [
    next(tag for tag in row['tags'] if tag in main_tags)
    for row in training_data
]

# Vectorize the tag documents using the TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Train a Naive Bayes classifier
clf = MultinomialNB().fit(X_train_tfidf, Y_train)

We save the trained classifier in order to use it later

In [31]:
# Persist the trained classifier object to disk
joblib.dump(
    value=clf,
    filename='naive_bayes_tags_classifier.joblib',
)

['naive_bayes_tags_classifier.joblib']

We load the saved classifier and run it on the whole dataset. The idea is the following: every record has a tags column, and we run the saved Naive Bayes classifier on this column. The input is the record's list of tags; the output is the most probable main tag from the main tags list. We save the result in a new dataset

In [32]:
# Load the saved classifier from a file
clf = joblib.load('naive_bayes_tags_classifier.joblib')

# Read the CSV file into a list of row dictionaries
with open('medium_articles.csv', 'r', encoding="utf8") as file:
    reader = csv.DictReader(file)
    rows = list(reader)

# Parse the textual list in the `tags` column and convert every tag to lowercase.
# NOTE: this used to be parsed with eval(), which executes arbitrary code
# found in the file; ast.literal_eval only accepts literals.
for row in rows:
    row['tags'] = [tag.lower() for tag in ast.literal_eval(row['tags'])]

# One space-joined tag document per article
X = [' '.join(row['tags']) for row in rows]

# Only the classifier is loaded from disk, so the TF-IDF vectorizer is rebuilt
# here by fitting it on the training documents again, and then applied to
# every article.
vectorizer = TfidfVectorizer()
vectorizer.fit([' '.join(row['tags']) for row in training_data])  # fit the vectorizer on the training data
X_tfidf = vectorizer.transform(X)  # use transform to apply the same vectorizer to the input data

# Predict the labels using the Naive Bayes classifier
Y = clf.predict(X_tfidf)

# Write every row back out with the predicted label in a new `main_tag` column.
# The `tags` column is written in its parsed, lowercased form.
header = reader.fieldnames + ['main_tag']
with open('medium_articles_predicted.csv', 'w', newline='', encoding="utf8") as file:
    writer = csv.DictWriter(file, fieldnames=header)
    writer.writeheader()
    for row, label in zip(rows, Y):
        row['main_tag'] = label
        writer.writerow(row)

In [33]:
# Load the labelled articles and preview the rows at positions 1 to 3
df = pd.read_csv("medium_articles_predicted.csv")
df.iloc[1:4]

Unnamed: 0,title,text,url,authors,timestamp,tags,main_tag
1,Your Brain On Coronavirus,Your Brain On Coronavirus\n\nA guide to the cu...,https://medium.com/age-of-awareness/how-the-pa...,['Simon Spichak'],2020-09-23 22:10:17.126000+00:00,"['mental health', 'coronavirus', 'science', 'p...",health
2,Mind Your Nose,Mind Your Nose\n\nHow smell training can chang...,https://medium.com/neodotlife/mind-your-nose-f...,[],2020-10-10 20:17:37.132000+00:00,"['biotechnology', 'neuroscience', 'brain', 'we...",health
3,The 4 Purposes of Dreams,Passionate about the synergy between science a...,https://medium.com/science-for-real/the-4-purp...,['Eshan Samaranayake'],2020-12-21 16:05:19.524000+00:00,"['health', 'neuroscience', 'mental health', 'p...",health


Remove unnecessary columns, swap the positions of the remaining ones, and rename them

In [34]:
# Columns that are dropped from the labelled dataset
columns_to_remove = ['title', 'url', 'authors', 'timestamp', 'tags']
trimmed = df.drop(columns=columns_to_remove)

# Exchange the positions of 'text' and 'main_tag'; every other column stays
# where it is
position_swap = {'text': 'main_tag', 'main_tag': 'text'}
reordered = trimmed[[position_swap.get(name, name) for name in trimmed.columns]]

# The predicted label is published under the name 'category'
df = reordered.rename(columns={'main_tag': 'category'})

# Overwrite the predictions file with the reduced table (no index column added)
df.to_csv('medium_articles_predicted.csv', index=False)

In [35]:
# Reload the rewritten file and preview the rows at positions 1 to 3
df = pd.read_csv('medium_articles_predicted.csv')
df.iloc[1:4]

Unnamed: 0,category,text
1,health,Your Brain On Coronavirus\n\nA guide to the cu...
2,health,Mind Your Nose\n\nHow smell training can chang...
3,health,Passionate about the synergy between science a...
