### Extract year from one file

In [64]:
import re

def get_year(filename):
    year = re.findall('(\d{4}).*', filename)[0]
    year = int(year)
    return year

filename = 'PENNIES/1960_s/1960PennyLincolnUp.LampE5.5.200Scan.07172019.P1.ChangedAngle_HRD10591_13-10-22-820.txt'
get_year(filename)    

1960

### Extract data from one file

In [80]:
import pandas as pd

def get_data(filename):
    df = pd.read_csv(filename, skiprows=13, sep='\t', names=['freq', 'intensity'])
    df = df.set_index('freq')
    return df

df = get_data(filename)
df.head()

Unnamed: 0_level_0,intensity
freq,Unnamed: 1_level_1
223.165,-15.62
223.4,-15.62
223.635,-15.62
223.869,-5.62
224.104,-12.62


### Process all files

In [81]:
from pathlib import Path

intensities = []
years = []

for filename in Path('PENNIES').glob('**/*.txt'):
    year = get_year(filename.name)
    years.append(year)
    df = get_data(filename)
    intensities.append(df['intensity'])

In [85]:
df = pd.concat(intensities, axis=1).transpose()
df.shape, len(years)

((6210, 2048), 6210)

### Train and test models 

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

classifiers = [
    KNeighborsClassifier(3),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=100),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel="linear", C=0.025),
]

X = df.values
scaler = StandardScaler()
X = scaler.fit_transform(X)
y = years
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
for clf in classifiers:
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    name = clf.__class__.__name__
    print(name, score)

KNeighborsClassifier 0.9146537842190016
SVC 0.9859098228663447
SVC 0.06521739130434782
DecisionTreeClassifier 0.23752012882447665
RandomForestClassifier 0.9432367149758454
MLPClassifier 0.9710144927536232
AdaBoostClassifier 0.0966183574879227
GaussianNB 0.5668276972624798


### Test a new file

In [140]:
filename = 'PENNIES/1980_s/1986.PennyLincolnUp.Lamp.En5.5.200Scan.071819.P1_HRD10591_17-08-12-973.txt'
intensities = get_data(filename).transpose()
intensities = scaler.transform(intensities)
year = clf.predict(intensities)[0]
year

1986