In [71]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re 

In [53]:
def has_number(string):
    """Return True if at least one character of ``string`` is a digit.

    An empty string yields False.
    """
    for char in string:
        if char.isdigit():
            # One digit is enough; stop scanning.
            return True
    return False

In [37]:
# NOTE: `data` is initialised here but never filled in this cell.
data = []
with open("archival_data.txt", 'r') as f:
    # Tokenise every record on whitespace. Blank lines are skipped: they would
    # split to an empty list and end up as an all-empty row downstream.
    lines = [line.split() for line in f if line.strip()]

# Adjust for Color Names
# A record with 8 tokens instead of 7 has its colour spread over tokens 4 and 5.
# Fuse those two tokens (no separator) so every record has exactly 7 fields.
for line in lines:
    if len(line) == 8:
        # Add a space between 4 and 5 if want later
        # Replacing the two-token slice with one token shifts the remaining
        # fields left by one position.
        line[4:6] = [line[4] + line[5]]

In [94]:
# g is the correct output, ie the ship's planet of origin.
# It is built straight from the parsed `lines` instead of from `train_df`:
# `train_df` is only created in the following cell, and that cell drops the
# "planet" column, so reading train_df["planet"] here fails when the notebook
# is run top to bottom.
g = pd.DataFrame(
    lines,
    columns=["name", "planet", "warp", "transponder", "surface", "axis_ratio", "action"],
)["planet"]


In [113]:
# Build the raw frame from the parsed records (one row per record, 7 fields).
train_df = pd.DataFrame(lines, columns=["name","planet","warp","transponder","surface","axis_ratio", "action"])

# BEGIN FEATURE ENGINEERING
# Drop the Action and Planet of Origin columns
train_df = train_df.drop(['action', 'planet'], axis=1)

# CATEGORICAL FEATURE ENGINEERING
# One-hot encode the Surface feature (categorical). `drop_first=True` omits the
# first category, and `dummy_na=True` adds an explicit `surface_nan` indicator.
# (Open question from the author: would a label encoder be a better fit?)
surface_features = pd.get_dummies(train_df['surface'], dummy_na=True, prefix='surface', drop_first=True)
train_df = pd.concat([train_df, surface_features], axis=1).drop(['surface'], axis=1)

# Manually encode Name features as 0/1 flags:
# name has digits, first character is A or E, first two characters are
# non-vowels, name contains a double 'l'.
names = train_df['name']

# Name has digits [Antarean]
numeric_name = [int(has_number(n)) for n in names]

# Name starts with A or E [Antarean]
# `startswith` also accepts a one-character name, which the previous
# pattern `[AE].+` rejected.
firstAE = [int(n.startswith(('A', 'E'))) for n in names]

# Name starts with two consonants [Klingon]
# NOTE(review): the pattern accepts any two non-vowel characters (digits and
# punctuation included) and requires at least one more character after them.
first_consonants = [int(re.match(r'^[^aeiouAEIOU]{2}.+', n) is not None) for n in names]

# Name contains a double 'l' [Romulan]
# A plain substring test also catches names that begin or end with "ll"; the
# previous pattern `.+[l]{2}.+` required a character on both sides.
double_l = [int('ll' in n) for n in names]

train_df['numeric_name'] = numeric_name
train_df['firstAE'] = firstAE
train_df['first_consonants'] = first_consonants
train_df['double_l'] = double_l

In [114]:
# Numerical Feature Encoding:
# Min-max normalize the numeric values
# Xn = (X - Xmin) / (Xmax - Xmin)
#
# NOTE: this cell reads the raw 'warp', 'transponder' and 'axis_ratio' columns
# and drops them at the end, so it cannot be re-run without first rebuilding
# train_df in the previous cell.


def _min_max_scale(column, lower, upper):
    """Scale ``column`` with the min-max formula using the given bounds.

    Parameters
    ----------
    column : pandas.Series of floats
    lower, upper : float
        Minimum and maximum used for scaling.

    Returns
    -------
    pandas.Series
        ``(column - lower) / (upper - lower)``. When ``upper == lower`` the
        result is ``column - lower`` (all zeros for a constant column)
        instead of the NaN that a 0/0 division would produce.
    """
    span = upper - lower
    return (column - lower) / (span if span != 0 else 1.0)


# Convert strings to floats in numeric columns
for numeric_col in ('warp', 'transponder', 'axis_ratio'):
    train_df[numeric_col] = train_df[numeric_col].map(float)

min_warp = train_df['warp'].min()
max_warp = train_df['warp'].max()
train_df['norm_warp'] = _min_max_scale(train_df['warp'], min_warp, max_warp)

min_t = train_df['transponder'].min()
max_t = train_df['transponder'].max()
train_df['norm_transpond'] = _min_max_scale(train_df['transponder'], min_t, max_t)

# NOTE: min_t / max_t are reused here, so after this cell they hold the
# axis_ratio bounds, not the transponder bounds.
min_t = train_df['axis_ratio'].min()
max_t = train_df['axis_ratio'].max()
train_df['norm_axis'] = _min_max_scale(train_df['axis_ratio'], min_t, max_t)

# Finally, drop the name, warp, transponder, and axis_ratio columns
train_df = train_df.drop(['name', 'warp', 'transponder', 'axis_ratio'], axis=1)
train_df

Unnamed: 0,surface_DarkBlue,surface_DarkGray,surface_DarkGreen,surface_LightBlue,surface_LightGray,surface_Orange,surface_Pink,surface_White,surface_Yellow,surface_nan,numeric_name,firstAE,first_consonants,double_l,norm_warp,norm_transpond,norm_axis
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.5,0.494505,1.0
1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.6,0.392223,0.52
2,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.9,0.255283,0.72
3,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0.7,0.486052,0.8
4,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0.7,0.455621,0.0
5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.9,0.274725,0.24
6,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1.0,0.247675,0.32
7,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0.9,0.0,0.36
8,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.8,0.066779,0.44
9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.0,0.108199,0.52
