In [1]:
import pandas as pd
import numpy as np
import os 

In [2]:
df = pd.read_csv("articles.csv")
df.head()

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...
2,William Koehrsen,2.8K,11,https://towardsdatascience.com/automated-featu...,Automated Feature Engineering in Python – Towa...,Machine learning is increasingly moving from h...
3,Gant Laborde,1.3K,7,https://medium.freecodecamp.org/machine-learni...,Machine Learning: how to go from Zero to Hero ...,If your understanding of A.I. and Machine Lear...
4,Emmanuel Ameisen,935,11,https://blog.insightdatascience.com/reinforcem...,Reinforcement Learning from scratch – Insight ...,Want to learn about applied Artificial Intelli...


In [3]:
import langdetect

def detect_language(text):
    return langdetect.detect(text)

df = pd.read_csv('articles.csv')
print("Nombre d'inputs avant suppression des textes non anglais : " + str(len(df)))

# créer une liste vide qui contiendra les indices des lignes à supprimer
indexes_to_delete = []

# parcourir chaque ligne du dataframe
for index, row in df.iterrows():

    lang = detect_language(row['text'][0:200])
    # si la valeur de l'attribut 'text' commence par un 'm'
    if lang != 'en':
    # ajouter l'index de cette ligne à la liste des indices à supprimer
        indexes_to_delete.append(index)

# supprimer les lignes du dataframe en utilisant la liste des indices à supprimer
df.drop(indexes_to_delete, inplace=True)

print("Nombre d'inputs après suppression des textes non anglais : " + str(len(df)))

Nombre d'inputs avant suppression des textes non anglais : 337
Nombre d'inputs après suppression des textes non anglais : 331


### PREPROCESSING


In [4]:
# REMOVE PUNCTUATION
import string

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

df["text"] = df["text"].apply(lambda row :remove_punctuation(row))
df["text"] = df["text"].apply(lambda row: row.lower())

In [5]:
# TOKENIZATION
import nltk
import nltk.corpus
from nltk.tokenize import word_tokenize

df["text"] = df["text"].apply(lambda x : word_tokenize(x))

In [6]:
# REMOVE STOP WORDS
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
df["text"] = df["text"].apply(lambda x: [w for w in x if not w in stop_words])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leolamoureux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# LEMMATIZATION
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

df["text"] = df["text"].apply(lambda x: [WordNetLemmatizer().lemmatize(w) for w in x])

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leolamoureux/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
import re
import enchant

# Create a UK English dictionary using enchant
dictionary = enchant.Dict("en_GB")

# Define a regular expression pattern to match non-alphabetic characters
pattern = re.compile(r'[^a-zA-Z]')

# Remove all non-English words from the documents
filtered_documents = []
for doc in df["text"]:
    filtered_doc = []
    for word in doc:
        # Remove non-alphabetic characters from the word
        word = pattern.sub('', word)
        # Check if the word is in the English dictionary
        if word and dictionary.check(word):
            filtered_doc.append(word)
    filtered_documents.append(filtered_doc)

### TF-IDF

In [12]:
unique_words = set()

# Iterate over each list of words
for words in filtered_documents:
# Add the unique words from this list to the set
    for word in words:
        unique_words.add(word)

# Convert the set of unique words back into a list
unique_words = list(unique_words)

In [None]:
from collections import Counter
#optimize
def create_word_count_dataframe(words, lists_of_words):
  # Create an empty data frame with the columns for each word
  df = pd.DataFrame(columns=words)

  # Create a list to store the rows of word counts
  rows = []

  # Iterate over each list of words
  for words_list in lists_of_words:
    # Use the Counter class to count the occurrences of each word
    word_counts = dict(Counter(words_list))

    # Ensure that the word_counts dictionary includes a count for every word
    row = {word: word_counts.get(word, 0) for word in words}

    # Add the word counts for this list of words to the list of rows
    rows.append(row)

  # Concatenate the rows into the data frame
  df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

  return df

In [None]:
df_words_count = create_word_count_dataframe(unique_words, df["text"])

In [None]:
from math import log

def create_tfidf_dataframe(df):
  # Create a new data frame with the same index as the original data frame (i.e., the words)
  tfidf_df = pd.DataFrame(index=df.columns)

  # Add a column for the total number of documents (i.e., the number of rows in the data frame)
  #tfidf_df['tfidf'] = df.shape[0]

  # Add a column for the document frequency of each word
  tfidf_df['df'] = (df > 0).sum(axis=0)

  # Add a column for the inverse document frequency of each word
  tfidf_df['idf'] = tfidf_df['df'].apply(lambda x: log(df.shape[0] / x))

  # Add a column for the TF-IDF of each word
  tfidf_df['tfidf'] = tfidf_df['df'] * tfidf_df['idf']

  return tfidf_df

In [None]:
print(create_tfidf_dataframe(df_words_count))

                 df       idf       tfidf
tweaked           4  4.415824   17.663296
cocktail          4  4.415824   17.663296
brightly          1  5.802118    5.802118
responds          3  4.703506   14.110518
conceptualizes    1  5.802118    5.802118
...             ...       ...         ...
peering           1  5.802118    5.802118
constantly       28  2.469914   69.157588
honeymoon         2  5.108971   10.217942
actually        146  0.818512  119.502716
godfather         5  4.192680   20.963402

[17974 rows x 3 columns]


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer object
vectorizer = TfidfVectorizer()
documents = [' '.join(doc) for doc in filtered_documents]

# Fit the vectorizer on the documents and transform them into TF-IDF vectors
tfidf = vectorizer.fit_transform(documents)

# Get the feature names (words)
feature_names = vectorizer.get_feature_names()

# Create a DataFrame from the TF-IDF vectors
tfidf_df = pd.DataFrame(tfidf.toarray(), columns=feature_names)

# Print the resulting DataFrame
print(tfidf_df)

     aah   ab  aback  abandoned  abbreviated  abilities   ability  ablation  \
0    0.0  0.0    0.0        0.0          0.0   0.000000  0.000000       0.0   
1    0.0  0.0    0.0        0.0          0.0   0.000000  0.019030       0.0   
2    0.0  0.0    0.0        0.0          0.0   0.000000  0.000000       0.0   
3    0.0  0.0    0.0        0.0          0.0   0.000000  0.000000       0.0   
4    0.0  0.0    0.0        0.0          0.0   0.000000  0.000000       0.0   
..   ...  ...    ...        ...          ...        ...       ...       ...   
326  0.0  0.0    0.0        0.0          0.0   0.000000  0.000000       0.0   
327  0.0  0.0    0.0        0.0          0.0   0.052112  0.030109       0.0   
328  0.0  0.0    0.0        0.0          0.0   0.000000  0.026217       0.0   
329  0.0  0.0    0.0        0.0          0.0   0.000000  0.000000       0.0   
330  0.0  0.0    0.0        0.0          0.0   0.000000  0.000000       0.0   

         able  abnormal  ...  zeroth  zest  zigzag 

