In [1]:
import pandas as pd

#Loading Twitter Dataset
twitter = pd.read_csv("data/raw/twitter.csv", encoding="latin1")

#Loading News Dataset line by line. Each line is "<sentiment><delimiter><text>",
#where the delimiter is either a semicolon or a comma.
#The sentiment is always the first field, so the real field separator is whichever
#of the two characters appears FIRST on the line. Checking only for the presence of
#";" would mis-split comma-separated lines whose text happens to contain a semicolon.
#NOTE(review): CSV quote characters around the text are kept as part of the text.
rows = []
with open("data/raw/news.csv", encoding="latin1") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue

        # Positions of the candidate delimiters that actually occur on this line
        delimiter_positions = [
            pos for pos in (line.find(";"), line.find(",")) if pos != -1
        ]
        if not delimiter_positions:
            # No delimiter at all: the line cannot be split into two fields
            continue

        cut = min(delimiter_positions)
        sentiment, text = line[:cut], line[cut + 1:]
        rows.append([sentiment.strip(), text.strip()])
news = pd.DataFrame(rows, columns=["sentiment", "text"])

#Loading Reports Dataset
reports = pd.read_csv("data/raw/reports.csv", encoding="latin1")



In [11]:
#Printing the first rows and the column index of each dataset
#(order: Twitter, News, Reports)
for df in (twitter, news, reports):
    print(df.head())
    print(df.columns)

                                                Text  Sentiment  label
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...          1      2
1  user: AAP MOVIE. 55% return for the FEA/GEED i...          1      2
2  user I'd be afraid to short AMZN - they are lo...          1      2
3                                  MNTA Over 12.00            1      2
4                                   OI  Over 21.37            1      2
Index(['Text', 'Sentiment', 'label'], dtype='object')
  sentiment                                               text Sentiment
0   neutral  "According to Gran , the company has no plans ...   neutral
1   neutral  "Technopolis plans to develop in stages an are...   neutral
2  negative  "The international electronic industry company...  negative
3  positive  With the new production plant the company woul...  positive
4  positive  "According to the company 's updated strategy ...  positive
Index(['sentiment', 'text', 'Sentiment'], dtype='object')
                        

In [12]:
#Fixing Rows of News
def clean_sentiment_text(row):
    """Repair a news row whose label and text were split at the wrong place.

    When a comma-separated line is split on a semicolon that belongs to its
    text, ``row["sentiment"]`` holds ``"<label>,<start of text>"`` and
    ``row["text"]`` holds the rest of the sentence. Both text fragments are
    rejoined so that no part of the sentence is lost.

    Parameters
    ----------
    row : mapping with string values under the keys "sentiment" and "text"
        (for example a DataFrame row passed by ``DataFrame.apply(axis=1)``).

    Returns
    -------
    tuple of (str, str)
        The stripped sentiment label and the stripped, complete text.
    """
    s = row["sentiment"]
    t = row["text"]

    # If sentiment accidentally contains text
    if "," in s:
        label, text_head = s.split(",", 1)
        # The whitespace originally surrounding the semicolon is not available
        # here, so a single space is re-inserted after it. The outer strip()
        # removes the trailing space when the remainder is empty.
        text = f"{text_head.strip()}; {t.strip()}".strip()
        return label.strip(), text

    return s.strip(), t.strip()

# Run the row repair over every news row and spread the returned
# (label, text) tuples over two columns: a new "Sentiment" column and the
# overwritten "text" column.
news[["Sentiment", "text"]] = news.apply(
    clean_sentiment_text,
    axis=1,
    result_type="expand",
)

In [13]:
#Printing Sentiment Counts for each dataset (order: News, Reports, Twitter)
for df in (news, reports, twitter):
    print(df["Sentiment"].value_counts())

#Printing the column names of each dataset
for title, df in (
    ("Twitter columns:", twitter),
    ("News columns:", news),
    ("Reports columns:", reports),
):
    print(title, df.columns.tolist())

Sentiment
neutral     2879
positive    1363
negative     604
Name: count, dtype: int64
Sentiment
neutral     3130
positive    1852
negative     860
Name: count, dtype: int64
Sentiment
 1    3685
-1    2106
Name: count, dtype: int64
Twitter columns: ['Text', 'Sentiment', 'label']
News columns: ['sentiment', 'text', 'Sentiment']
Reports columns: ['Sentence', 'Sentiment']


In [14]:
# Target scheme shared by all three datasets: a "text" column plus an integer
# "label" column where 0 = negative, 1 = neutral, 2 = positive.
SENTIMENT_TO_LABEL = {"negative": 0, "neutral": 1, "positive": 2}

# Twitter Standardisation
# Sentiment is coded as 1 / -1 in this dataset, mapped onto 2 / 0.
twitter["label"] = twitter["Sentiment"].map({1: 2, -1: 0})
twitter = twitter.rename(columns={"Text": "text"})[["text", "label"]]

# News Standardisation
# Map from the repaired "Sentiment" column, not the raw "sentiment" column:
# a raw value that still has text attached to the label matches no key in the
# mapping and would become a NaN label.
news["label"] = news["Sentiment"].str.lower().map(SENTIMENT_TO_LABEL)
news = news[["text", "label"]]

# Reports Standardisation
reports["label"] = reports["Sentiment"].str.lower().map(SENTIMENT_TO_LABEL)
reports = reports.rename(columns={"Sentence": "text"})[["text", "label"]]

In [15]:
# Check the column layout of each standardised frame
for df in (twitter, news, reports):
    print(df.columns)

Index(['text', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')
Index(['text', 'label'], dtype='object')


In [16]:
#Printing the first rows and the column index of each standardised dataset
#(order: Twitter, News, Reports)
for df in (twitter, news, reports):
    print(df.head())
    print(df.columns)

                                                text  label
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...      2
1  user: AAP MOVIE. 55% return for the FEA/GEED i...      2
2  user I'd be afraid to short AMZN - they are lo...      2
3                                  MNTA Over 12.00        2
4                                   OI  Over 21.37        2
Index(['text', 'label'], dtype='object')
                                                text  label
0  "According to Gran , the company has no plans ...      1
1  "Technopolis plans to develop in stages an are...      1
2  "The international electronic industry company...      0
3  With the new production plant the company woul...      2
4  "According to the company 's updated strategy ...      2
Index(['text', 'label'], dtype='object')
                                                text  label
0  The GeoSolutions technology will leverage Bene...      2
1  $ESI on lows, down $1.50 to $2.50 BK a real po...      0
2  For the last qu

In [19]:
#Saving Cleaned Data (the row index is not written to the files)
for df, out_path in (
    (twitter, "data/clean/twitter.csv"),
    (news, "data/clean/news.csv"),
    (reports, "data/clean/reports.csv"),
):
    df.to_csv(out_path, index=False)

In [20]:
#Using Cleaned Data: re-read the three frames from the saved CSV files
twitter, news, reports = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/clean/twitter.csv",
        "data/clean/news.csv",
        "data/clean/reports.csv",
    )
)


In [21]:
# Show the first two rows of each reloaded frame
for df in (twitter, news, reports):
    print(df.head(2))

                                                text  label
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...      2
1  user: AAP MOVIE. 55% return for the FEA/GEED i...      2
                                                text  label
0  "According to Gran , the company has no plans ...      1
1  "Technopolis plans to develop in stages an are...      1
                                                text  label
0  The GeoSolutions technology will leverage Bene...      2
1  $ESI on lows, down $1.50 to $2.50 BK a real po...      0


In [23]:
#Function to Remove Unwanted Characters
import re

def clean_text(text):
    """Normalise one text value.

    Missing values (as judged by ``pd.isna``) become an empty string.
    Otherwise the text is lower-cased, every ``http...`` token and every
    ``@word`` token is removed, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is trimmed.
    """
    if pd.isna(text):
        return ""

    lowered = text.lower()
    without_urls = re.sub(r"http\S+", "", lowered)            # drop URLs
    without_mentions = re.sub(r"@\w+", "", without_urls)      # drop mentions
    # Removals can leave gaps, so whitespace is squeezed last
    return re.sub(r"\s+", " ", without_mentions).strip()


In [24]:
#Removing Unwanted Characters from the "text" column of every dataset
for df in (twitter, news, reports):
    df["text"] = df["text"].map(clean_text)


In [25]:
#Checking whether the removal caused any issues: print 5 random rows per dataset
for df in (twitter, news, reports):
    print(df.sample(5))

                                                   text  label
1192  user i'd buy mon. i'm waiting for a more serio...      2
1010  shd - appearance of a 180 bullish cup and hand...      2
2087  s. jobs was keeping money to innovate and buy ...      0
1242  athx doesn't like 1.35 but i do staying long f...      2
3983  jpm bounced off it's fib support level today too.      0
                                                   text  label
1965  "`` this is super nasty , '' said eric schultz...      0
2545  "poyry plc additional information by : heikki ...      1
3401  the value of the total investment is about eur...      1
3572  earnings per share eps in 2005 amounted to a l...      1
4205  incap furniture is presently negotiating staff...      0
                                                   text  label
4495  the writing and publication of lemmink+ã±inen ...      1
3154  the total emissions reduction from 1999 to 201...      1
1969  cablevision systems corp. cvc their madison sq...

In [26]:
# Write the processed frames to data/processed (the row index is not written)
for df, out_path in (
    (twitter, "data/processed/twitter.csv"),
    (news, "data/processed/news.csv"),
    (reports, "data/processed/reports.csv"),
):
    df.to_csv(out_path, index=False)