In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Notebook-wide switches.
CONCISE = False    # True -> preview frames with .head() instead of showing them in full
TEST_RATIO = 0.10  # used as test_size for train_test_split later in the notebook

# Read the troop-movement records from the CSV file.
filename = 'troop_movements.csv'
data = pd.read_csv(filename)
df = pd.DataFrame(data)

# Cleaning step, currently disabled (rows whose unit_type is 'unknown' are kept):
# df = df[df['unit_type'] != 'unknown']

# Show the loaded frame; CONCISE limits the output to the first rows.
display(df.head() if CONCISE else df)

In [None]:
# Row count per 'empire_or_resistance' value, flattened into a frame with a 'count' column.
count_by_alignment = (
    df.groupby('empire_or_resistance')
      .size()
      .reset_index(name='count')
)
display(count_by_alignment)

In [None]:
# Row count per 'homeworld' value, flattened into a frame with a 'count' column.
count_by_homeworld = (
    df.groupby('homeworld')
      .size()
      .reset_index(name='count')
)
display(count_by_homeworld.head() if CONCISE else count_by_homeworld)

# Adding up the per-group counts gives the number of rows that were grouped.
print('Total Count: %d' % count_by_homeworld['count'].sum())

In [None]:
# Row count per 'unit_type' value, flattened into a frame with a 'count' column.
count_by_unit_type = (
    df.groupby('unit_type')
      .size()
      .reset_index(name='count')
)
display(count_by_unit_type.head() if CONCISE else count_by_unit_type)

# Adding up the per-group counts gives the number of rows that were grouped.
print('Total Count: %d' % count_by_unit_type['count'].sum())

In [None]:
# Boolean column: True on rows whose alignment is exactly 'resistance'.
# It is read as the target variable in the model-training cell.
df['is_resistance'] = df['empire_or_resistance'].eq('resistance')
display(df.head() if CONCISE else df)

In [None]:
# Global seaborn theme and the base palette the bar colours are picked from.
sns.set_theme(style="whitegrid")
sns_palette = sns.color_palette('muted')

# One bar per alignment value; palette entries 1 and 0 are used in that (swapped) order.
ax = sns.barplot(
    data=count_by_alignment,
    x="empire_or_resistance",
    y="count",
    hue="empire_or_resistance",
    palette=[sns_palette[1], sns_palette[0]],
)
ax.set_title("Character Count by Empire or Resistance", y=1.05)
ax.set_xlabel('Empire or Resistance')
ax.set_ylabel('Count')

# Write each bar's height (thousands-separated) 10 points above its top edge, tilted 45 degrees.
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_top):,}',
        (bar_centre, bar_top),
        xytext=(0, 10), textcoords='offset points',
        ha='center', va='center',
        fontsize=10, color='black', rotation=45,
    )
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# One seed for both the split and the classifier, so a fresh
# "Restart & Run All" reproduces the same tree, accuracy and importances.
RANDOM_STATE = 1977

# Define features (X) and target variable (y)
X = df[['unit_type', 'homeworld']]
X_enc = pd.get_dummies(X)  # expand the non-numeric feature columns into indicator columns
y = df['is_resistance']

# Hold out TEST_RATIO of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X_enc, y, test_size=TEST_RATIO, random_state=RANDOM_STATE
)

# Train the classifier.
# scikit-learn randomly permutes the candidate features at every split, even
# with the default "best" splitter, so without a fixed random_state the fitted
# tree (and everything derived from it) can differ from run to run.
clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)

# Predict alignment for test set
y_pred = clf.predict(X_test)

# Calculate model accuracy (fraction of held-out rows predicted correctly)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
import pickle

# Pair every importance score with the name of the column it belongs to.
# The names are taken from X_train, the frame the classifier was fitted on,
# so they are guaranteed to line up with clf.feature_importances_ without
# re-running the one-hot encoding.
importances = clf.feature_importances_

feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': importances,
}).sort_values(by='Importance', ascending=False)

# Create a bar plot that shows feature importance.
# There is one horizontal bar per encoded feature, so the figure height grows
# with the number of features. This keeps the labels legible with the default
# tight_layout padding; a negative pad would push the axes past the figure
# canvas and only looks right when the backend crops to a tight bounding box.
fig_height = max(6, 0.25 * len(feature_importances))
plt.figure(figsize=(10, fig_height))
sns.barplot(x='Importance', y='Feature', data=feature_importances, hue='Feature', palette='viridis')
plt.title('Feature Importance')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Save the model as trained_model.pkl
# NOTE: only unpickle this file from a trusted source; pickle.load can run arbitrary code.
model_filename = 'trained_model.pkl'
with open(model_filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

# Most Influential Features (first row after sorting by importance, descending)
print("Most Influential Feature:", feature_importances['Feature'].iloc[0])