In [1]:
# Download and install the Italian spaCy model that extract_keyword loads
# ("it_core_news_lg"). The English transformer model is kept commented out
# as an alternative.
#!python -m spacy download en_core_web_trf
!python -m spacy download it_core_news_lg

Collecting it-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_lg-3.7.0/it_core_news_lg-3.7.0-py3-none-any.whl (567.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.9/567.9 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: it-core-news-lg
Successfully installed it-core-news-lg-3.7.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_lg')


In [3]:
import pandas as pd
import pathlib
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from string import punctuation

In [4]:
def remove_special_chars_and_urls(text):
    """Strip URLs and special characters from a piece of text.

    Parameters
    ----------
    text : str or missing value
        Raw text. Anything that is not a ``str`` (for example the ``NaN`` /
        ``None`` that pandas yields for an empty cell) is treated as empty.

    Returns
    -------
    str
        The text without URLs, with only word characters (minus ``_``),
        whitespace and the punctuation ``. , ' " ! ?`` kept, and with leading
        and trailing whitespace removed. Inner whitespace is left untouched.
    """
    # Missing cells are not strings; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''

    # `http\S+` already matches every `https...` token, so no separate
    # alternative is needed; the pattern has no anchors, so no flags either.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Drop everything except word characters, whitespace and . , ' " ! ?
    # The underscore counts as a word character, so it is removed explicitly.
    text = re.sub(r'[^\w\s.,\'"!?]|_', '', text)
    return text.strip()

In [5]:
# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# once per language instead of once per call.
_STOP_WORDS_CACHE = {}


def _load_stop_words(language):
    """Return the NLTK stop-word set for ``language``, reading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def remove_stop_words(text, language='italian'):
    """Remove stop words from ``text``.

    Parameters
    ----------
    text : str
        Text to filter.
    language : str, optional
        Language name passed to both the NLTK stop-word corpus and the NLTK
        tokenizer. Defaults to ``'italian'``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    stop_words = _load_stop_words(language)
    # Tokenize with the model of the same language; word_tokenize otherwise
    # falls back to its English default.
    word_tokens = word_tokenize(text, language=language)
    # Compare case-insensitively so capitalised stop words ("Il", "Non")
    # are filtered as well; the surviving tokens keep their original casing.
    filtered_sentence = [w for w in word_tokens if w.lower() not in stop_words]

    return ' '.join(filtered_sentence)

In [6]:
# Loaded spaCy pipelines keyed by model name. Loading a model is expensive,
# and extract_keyword is applied once per tweet, so each model is loaded once.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name):
    """Return the spaCy pipeline ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def extract_keyword(text, model_name="it_core_news_lg"):
    """Extract the adjectives and nouns of ``text`` as a comma-separated string.

    Parameters
    ----------
    text : str
        Text to analyse; it is lower-cased before being passed to spaCy.
    model_name : str, optional
        Name of the installed spaCy model to use. Defaults to
        ``"it_core_news_lg"``; pass e.g. ``"en_core_web_trf"`` for English.

    Returns
    -------
    str
        The text of every token tagged ``ADJ`` or ``NOUN``, in document order,
        joined by commas (empty string when there is none).
    """
    pos_tag = ('ADJ', 'NOUN')
    nlp = _get_pipeline(model_name)
    doc = nlp(text.lower())
    keywords = [w.text for w in doc if w.pos_ in pos_tag]
    return ','.join(keywords)

In [7]:
def cleaning_data(df):
    """Deduplicate the dataset and clean the text of every tweet.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset with a ``full_text`` column. It is not modified.

    Returns
    -------
    pandas.Series
        The cleaned ``full_text`` of each row that survives deduplication.
        The Series keeps the index labels of those rows, so it can be aligned
        back to ``df`` by index.
    """
    # drop_duplicates returns a new frame; the result has to be kept,
    # otherwise the call has no effect.
    deduplicated = df.drop_duplicates()
    tweets = deduplicated['full_text'].apply(remove_special_chars_and_urls)

    return tweets

In [8]:
def keyword_extraction(tweets):
    """Turn every tweet into its comma-separated keyword string.

    Each element of ``tweets`` is first passed through ``remove_stop_words``
    and the result is then passed through ``extract_keyword``.

    Parameters
    ----------
    tweets : pandas.Series
        Tweet texts.

    Returns
    -------
    pandas.Series
        One keyword string per input element, with the same index.
    """
    without_stop_words = tweets.apply(remove_stop_words)
    return without_stop_words.apply(extract_keyword)

In [10]:
if __name__ == "__main__":

    # Input and output locations of the pipeline.
    dataset_path = "../dataset/italian_tweets.csv"
    #dataset_path = "dataset/english_tweets.csv"
    # NOTE(review): the input is read from ../dataset/ while the output goes to
    # dataset/ (both relative to the working directory) - confirm this is intended.
    output_path = 'dataset/keyword_italian_tweets.csv'

    # Fetch the NLTK resources first so a failed download surfaces before any
    # processing is done.
    for resource in ('punkt', 'stopwords', 'punkt_tab'):
        nltk.download(resource)

    dataset = pd.read_csv(dataset_path, sep=';')
    cleaned_tweets = cleaning_data(dataset)

    keyword_tweets = keyword_extraction(cleaned_tweets).rename('keyword')

    # Align on the index and keep only rows that have a keyword entry, so a
    # row removed during cleaning does not come back with a NaN keyword.
    keyword_dataset = pd.concat([dataset, keyword_tweets], axis=1, join='inner')
    # NOTE(review): to_csv also writes the index as an unnamed first column;
    # pass index=False if consumers should not see it.
    keyword_dataset.to_csv(output_path, sep=';')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/lucianoimbimbo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lucianoimbimbo/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/lucianoimbimbo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
