In [1]:
import pandas as pd

In [None]:
# Load the two Excel workbooks this notebook works from: the headlines
# table and the per-company table that carries the Label column.
final_merged_all_news = pd.read_excel("final_headlines.xlsx")
all_data = pd.read_excel("combined_companies.xlsx")

In [None]:
# Inspect the (rows, columns) dimensions of the table loaded from final_headlines.xlsx.
final_merged_all_news.shape

In [None]:
# Remove the leftover index column that the Excel export stored as 'Unnamed: 0'.
# errors="ignore" keeps this cell safe to re-run (the column is already gone on
# a second execution) and tolerant of workbooks saved without an index column.
final_merged_all_news = final_merged_all_news.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Count the distinct RIC values present in the headlines table.
final_merged_all_news["RIC"].nunique()

In [None]:
# Inspect the (rows, columns) dimensions of the table loaded from combined_companies.xlsx.
all_data.shape

In [None]:
# Count the distinct Instrument identifiers in all_data.
all_data["Instrument"].nunique()

In [None]:
# Tally how many rows carry each Label value.
all_data['Label'].value_counts()

In [None]:
# Remove the leftover index column that the Excel export stored as 'Unnamed: 0'.
# errors="ignore" keeps this cell safe to re-run (the column is already gone on
# a second execution) and tolerant of workbooks saved without an index column.
all_data = all_data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Standardize column names so both tables share the "Instrument" join key.
# Re-binding the name (instead of inplace=True) keeps the cell safe to re-run.
final_merged_all_news = final_merged_all_news.rename(columns={'RIC': 'Instrument'})

# Ensure consistency in case and whitespace for merging
final_merged_all_news["Instrument"] = final_merged_all_news["Instrument"].str.strip().str.upper()
all_data["Instrument"] = all_data["Instrument"].str.strip().str.upper()

# Find common companies in both datasets
common_companies = set(final_merged_all_news["Instrument"]) & set(all_data["Instrument"])

# Filter both datasets to keep only common companies.
# .copy() makes each result an independent frame, so later column assignments
# (including a re-run of this cell) do not trigger SettingWithCopyWarning.
final_merged_all_news = final_merged_all_news[final_merged_all_news["Instrument"].isin(common_companies)].copy()
all_data = all_data[all_data["Instrument"].isin(common_companies)].copy()

# One (Instrument, Label) row per company: if all_data holds several rows for
# the same instrument, merging against it directly would repeat every headline
# once per row and silently inflate the merged dataset.
company_labels = all_data[['Instrument', 'Label']].drop_duplicates()
conflicting_instruments = company_labels.loc[
    company_labels["Instrument"].duplicated(), "Instrument"
].nunique()
if conflicting_instruments:
    # Same instrument with different labels cannot be resolved automatically.
    print(f"Warning: {conflicting_instruments} instrument(s) have more than one distinct Label; "
          "their headlines will appear once per label.")

# Merge datasets based on Instrument (INNER JOIN to keep only common companies)
merged_df = final_merged_all_news.merge(company_labels, on='Instrument', how='inner')

# Drop rows with missing headlines
merged_df = merged_df.dropna(subset=['Headlines'])

# Check the number of companies retained. The two counts can differ because
# companies whose headlines are all missing drop out of merged_df only.
print(f"Number of common companies retained: {merged_df['Instrument'].nunique()}")
print(f"Number of common companies retained in numerical data: {all_data['Instrument'].nunique()}")
# Check label distribution
print("Label Distribution in Merged Data:\n", merged_df["Label"].value_counts())

In [None]:
# Remove the 'AD' column from the numerical table before modelling.
# errors="ignore" keeps this cell safe to re-run: on a second execution the
# column is already gone and a plain drop would raise KeyError.
all_data = all_data.drop(columns='AD', errors='ignore')

In [None]:
# Re-check all_data's (rows, columns) dimensions after the earlier filtering and column drops.
all_data.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Prepare numerical features (excluding company identifier and target label)
X_numerical = all_data.drop(columns=["Instrument", "Label"])  # Keep only numerical indicators
y_numerical = all_data["Label"]  # Target variable (1 = M&A, 0 = Non-M&A)

# Split dataset into training (80%) and testing (20%) sets.
# stratify keeps the class proportions of Label identical in both splits, so
# the reported metrics are not skewed by an unlucky draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X_numerical,
    y_numerical,
    test_size=0.2,
    random_state=42,
    stratify=y_numerical,
)

# Train SVM Model with a linear kernel.
# SVMs are not scale-invariant, so features are standardized first. Wrapping
# scaler + classifier in one pipeline means the scaler is fitted on the
# training split only and is applied automatically on every predict /
# predict_proba call with raw features. The fitted SVC itself is available as
# svm_model[-1] if its attributes are needed.
# random_state fixes the internal cross-validation that probability=True uses,
# making the probability estimates reproducible across runs.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel="linear", probability=True, random_state=42),
)
svm_model.fit(X_train, y_train)

# Make Predictions
y_pred = svm_model.predict(X_test)

# Model Evaluation: Accuracy and Classification Report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print Evaluation Metrics
print(f"Model Accuracy on Numerical Dataset: {accuracy:.4f}")
print("Classification Report:\n", classification_rep)