**SOURCES**

https://pandas.pydata.org/docs/

https://www.nltk.org/

**DATA CLEANING**

In [None]:
import pandas as pd

In [None]:
# Load the tweet CSV into a DataFrame; the file is decoded as latin1.
df = pd.read_csv('data/trump1_tidy.csv', encoding = 'latin1')

In [None]:
# Summarise the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Fallback for a CSV that has no column names. The snippet is wrapped in a
# string literal so it does not execute; when enabled it renames the columns
# to Kolumna_1 .. Kolumna_N.
"""num_columns = len(df.columns)
column_names = [f'Kolumna_{i+1}' for i in range(num_columns)]
df.columns = column_names"""

In [None]:
# Columns that the rest of the pipeline works on.
columns_to_keep = [
    'Tweet'
    #'Kolumna_6'  (presumably the text column when the generated names are used)
]
# Take an explicit copy: later cells add and overwrite columns on
# df_filtered, and assigning into a selection derived from df raises
# SettingWithCopyWarning (the result may or may not propagate to df).
df_filtered = df[columns_to_keep].copy()

In [None]:
# Preview the reduced frame.
df_filtered.head()

In [None]:
# Inspect the dtype of the text column. df_filtered is built from
# columns_to_keep, which only contains 'Tweet'; looking up 'Kolumna_6'
# here raised KeyError.
print(df_filtered['Tweet'].dtype)

In [None]:
# Make sure every tweet is a str before the regex-based cleaning runs.
# 'Tweet' is the only column kept in df_filtered ('Kolumna_6' raised
# KeyError). Missing values are replaced with '' first, because
# astype(str) alone would turn NaN into the literal word 'nan'.
df_filtered.loc[:, 'Tweet'] = df_filtered['Tweet'].fillna('').astype(str)

In [None]:
#import nltk
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('vader_lexicon')

In [None]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
#tweets cleaning function
def clean_tweet(text):
    """Normalise a raw tweet into space-separated lowercase words.

    Steps, in order: lowercase, drop links, drop every character that is
    not a-z or whitespace, drop words of one or two letters, drop English
    stop words.

    Args:
        text: Raw tweet text. Any non-string value (e.g. NaN from an empty
            CSV cell) is treated as an empty tweet.

    Returns:
        The remaining words joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Links are removed first, while '://' and '.' are still present. If
    # punctuation is stripped beforehand, a word glued to a link (e.g.
    # "ok:http://x") survives the short-word filter as part of one token.
    text = re.sub(r'http\S*|www\.\S+', '', text)
    # Keep only ASCII letters and whitespace; this also covers punctuation,
    # digits and underscores, so no separate punctuation pass is needed.
    text = re.sub(r'[^a-z\s]', '', text)
    # Drop words that are one or two letters long.
    text = re.sub(r'\b\w{1,2}\b', '', text)
    # split()/join collapses all whitespace, so no explicit strip is needed.
    return ' '.join(word for word in text.split() if word not in ENGLISH_STOP_WORDS)
    

In [None]:
# Run clean_tweet on every value of 'Tweet' and store the result as 'Cleaned_Tweets'.
df_filtered.loc[:,'Cleaned_Tweets'] = df_filtered['Tweet'].apply(clean_tweet)

In [None]:
# Check the new 'Cleaned_Tweets' column.
df_filtered.head()

In [None]:
#tokenization
def tokenize_text(text):
    """Split `text` into a list of tokens with NLTK's word_tokenize."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
# Tokenize every cleaned tweet; each cell of 'Tokens' holds the value returned by tokenize_text.
df_filtered['Tokens'] = df_filtered['Cleaned_Tweets'].apply(tokenize_text)

In [None]:
# Check the new 'Tokens' column.
df_filtered.head()

In [None]:
#lemmatization
# One shared lemmatizer instance, reused for every call below.
lemmatizer = WordNetLemmatizer()

def lemmatize_token_list(token_list):
    """Return a new list with each token passed through the WordNet lemmatizer.

    lemmatize() is called without a part-of-speech argument, so the
    lemmatizer's default applies to every token.
    """
    return list(map(lemmatizer.lemmatize, token_list))

In [None]:
# Lemmatize every token list and store the result as 'Lemmas'.
df_filtered['Lemmas'] = df_filtered['Tokens'].apply(lemmatize_token_list)

In [None]:
# Check the new 'Lemmas' column.
df_filtered.head()

In [None]:
def clean_lemmas(lemmas):
    """Keep only the lemmas that consist entirely of word characters.

    Args:
        lemmas: Iterable of lemma strings.

    Returns:
        A new list with the lemmas that match ``\\w+`` from start to end,
        in their original order. Empty strings are dropped.
    """
    # fullmatch instead of match(r'^\w+$'): `$` also matches just before a
    # trailing newline, so a lemma like 'abc\n' used to slip through.
    return [lemma for lemma in lemmas if re.fullmatch(r'\w+', lemma)]

In [None]:
# Filter every lemma list with clean_lemmas and store the result as 'Cleaned_Lemmas'.
df_filtered.loc[:,'Cleaned_Lemmas'] = df_filtered['Lemmas'].apply(clean_lemmas)

In [None]:
# Join each list of cleaned lemmas back into one space-separated string.
df_filtered['Lemmas_Text'] = df_filtered['Cleaned_Lemmas'].apply(' '.join)

In [None]:
# Check the 'Cleaned_Lemmas' and 'Lemmas_Text' columns.
df_filtered.head()

In [None]:
#choosing columns to keep
columns_to_keep = [
    'Tweet',
    'Cleaned_Tweets',
    'Tokens',
    'Lemmas',
    'Cleaned_Lemmas',
    'Lemmas_Text',
]
# Explicit copy: the evaluation cells assign new columns to df_eval, and
# writing into a selection derived from df_filtered raises
# SettingWithCopyWarning.
df_eval = df_filtered[columns_to_keep].copy()

In [None]:
# Preview the frame used for evaluation.
df_eval.head()

**DATA EVALUATION**

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
#evaluation function
# One analyzer instance shared by all calls.
sid = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    """Return the 'compound' entry of VADER's polarity scores for `text`."""
    return sid.polarity_scores(text)['compound']

# Score the lemmatized text of every tweet.
# NOTE(review): 'Lemmas_Text' has punctuation, capitalisation and stop words
# removed; the stop-word list presumably includes negations such as "not".
# VADER is usually run on raw text - consider scoring 'Tweet' instead; confirm.
df_eval.loc[:,'vader_sentiment'] = df_eval['Lemmas_Text'].apply(vader_sentiment)

In [None]:
#checkpoint
# Write the current df_eval to 'testtrump.csv' without the index column.
df_eval.to_csv('testtrump.csv', index = False)

In [None]:
# Preview the first five rows, including the new 'vader_sentiment' column.
df_eval.head(5)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
#distribution of sentiment
# Histogram of the 'vader_sentiment' column using 20 bins.
sns.histplot(df_eval['vader_sentiment'], bins = 20)
# Plot title (Polish): "Distribution of Vader sentiment scores".
plt.title('Rozkład wyników sentymentu Vader')
plt.show()

In [None]:
#mean value for overall sentiment
# Average of the per-tweet values in 'vader_sentiment'.
mean_vader = df_eval.loc[:, 'vader_sentiment'].mean()
# Label (Polish): "Mean score".
print("Średni wynik: ", mean_vader)

In [None]:
#describe for bar plot
# Bucket the compound score into three labels. pd.cut bins are right-closed
# by default: (-inf, -0.1] negative, (-0.1, 0.1] neutral, (0.1, inf) positive.
# Plain column assignment replaces the float column with the categorical
# result; `.loc[:, col] = ...` instead tries to write the string labels into
# the existing float column in place (incompatible-dtype warning on recent
# pandas).
# NOTE: this overwrites the numeric scores, so the cell cannot be re-run
# without recomputing 'vader_sentiment' first.
df_eval['vader_sentiment'] = pd.cut(
    df_eval['vader_sentiment'],
    bins=[-float("inf"), -0.1, 0.1, float("inf")],
    labels=['negative', 'neutral', 'positive'],
)

In [None]:
#bar plot
# value_counts() orders its result by frequency, so the bar order changes
# with the data while the colour list is fixed. Pin the label order so that
# red/blue/green always mean negative/neutral/positive; a label with no rows
# is shown as 0.
sentiment_order = ['negative', 'neutral', 'positive']
sentiment_counts = df_eval['vader_sentiment'].value_counts().reindex(sentiment_order, fill_value=0)
sentiment_counts.plot(kind = 'bar', color = ['red', 'blue', 'green'])

In [None]:
#summary for counting
# Count the labels once and read each total from that result (missing
# labels count as 0), instead of recomputing value_counts() per label.
label_counts = df_eval['vader_sentiment'].value_counts()
negative_count = label_counts.get('negative', 0)
neutral_count = label_counts.get('neutral', 0)
positive_count = label_counts.get('positive', 0)
print("Total negatives: ", negative_count,
     "\nTotal neutrals: ", neutral_count,
     "\nTotal positives: ", positive_count,
     "\nTotal: ", negative_count + neutral_count + positive_count)

In [None]:
#final saves
# Write the final df_eval to 'set1.csv' without the index column. If the
# cells ran in order, 'vader_sentiment' holds the negative/neutral/positive
# labels at this point, not the numeric scores.
df_eval.to_csv('set1.csv', index = False)