<a href="https://colab.research.google.com/github/harshil0217/NFL_Rookie_Comps/blob/main/ProfileEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [73]:
import pandas as pd
import numpy as np
from google.colab import auth
from google.cloud import storage
import io
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModelForSequenceClassification
from transformers import pipeline
import torch
from torch.utils.data import Dataset, DataLoader
auth.authenticate_user()



In [74]:
client = storage.Client(project='cmse-381')
bucket = client.get_bucket('stats_draftprospects')
blob = bucket.blob('draft_profiles.csv')
content = blob.download_as_string()
profiles = pd.read_csv(io.BytesIO(content))

In [75]:
profiles

Unnamed: 0,player_name,draft_profile,draft_year,pos_abbr,school_abbr,pick,overall,round,team_abbr,weight,height,player_image
0,Alex Smith,Nephew of Michigan State head coach John L. Sm...,2005,QB,UTAH,1.0,1.0,1.0,SF,217.0,76.0,
1,Ronnie Brown,Brown has played second fiddle to Carnell Will...,2005,RB,AUB,2.0,2.0,1.0,MIA,233.0,72.0,
2,Braylon Edwards,"Edwards' father, Stanley, played at Michigan a...",2005,WR,MICH,3.0,3.0,1.0,CLE,211.0,75.0,
3,Cedric Benson,Benson was drafted by the Los Angeles Dodgers ...,2005,RB,TEX,4.0,4.0,1.0,CHI,222.0,71.0,
4,Carnell Williams,Williams started two games and played in nine ...,2005,RB,AUB,5.0,5.0,1.0,TB,217.0,71.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
4021,Jermar Jefferson,Jefferson is an efficient back who follows and...,2021,RB,ORST,30.0,257.0,7.0,DET,206.0,70.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
4022,Dax Milne,Milne is a savvy route runner with average bur...,2021,WR,BYU,31.0,258.0,7.0,WSH,193.0,73.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
4023,Grant Stuard,Stuard is an undersized off-the-ball linebacke...,2021,OLB,HOU,32.0,259.0,7.0,TB,225.0,72.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
4024,Robert Jones,Jones started games at guard at the junior co...,1992,LB,ECU,24.0,24.0,1.0,DAL,,,


In [76]:
#use BERT-NER to remove Person, Organization, Place, Date, and Time


device = 0 if torch.cuda.is_available() else -1

tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

class ProfileDataset(Dataset):
    def __init__(self, profiles):
        self.profiles = profiles
        self.nlp = pipeline("ner", model=model, tokenizer=tokenizer, device=device, batch_size=32, aggregation_strategy="simple")

    def __len__(self):
        return len(self.profiles)

    def __getitem__(self, idx):
        draft_profile = self.profiles.iloc[idx]['draft_profile']
        processed_profile = self.remove_entities(draft_profile)
        return processed_profile

    def remove_entities(self, text):
      entities = self.nlp(text)
      for entity in entities:
        text = text.replace(entity['word'], '')
      return text

dataset = ProfileDataset(profiles)
dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

processed_profiles = []
for batch in dataloader:
    processed_profiles.extend(batch)

profiles['draft_profile'] = processed_profiles

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [77]:
profiles

Unnamed: 0,player_name,draft_profile,draft_year,pos_abbr,school_abbr,pick,overall,round,team_abbr,weight,height,player_image
0,Alex Smith,Nephew of head coach . attempted only five ...,2005,QB,UTAH,1.0,1.0,1.0,SF,217.0,76.0,
1,Ronnie Brown,has played second fiddle to nell throughout ...,2005,RB,AUB,2.0,2.0,1.0,MIA,233.0,72.0,
2,Braylon Edwards,"' father, , played at and in the with the a...",2005,WR,MICH,3.0,3.0,1.0,CLE,211.0,75.0,
3,Cedric Benson,was drafted by the in the th round out of hi...,2005,RB,TEX,4.0,4.0,1.0,CHI,222.0,71.0,
4,Carnell Williams,started two games and played in nine in befo...,2005,RB,AUB,5.0,5.0,1.0,TB,217.0,71.0,
...,...,...,...,...,...,...,...,...,...,...,...,...
4021,Jermar Jefferson,is an efficient back who follows and reads bl...,2021,RB,ORST,30.0,257.0,7.0,DET,206.0,70.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
4022,Dax Milne,is a savvy route runner with average burst an...,2021,WR,BYU,31.0,258.0,7.0,WSH,193.0,73.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
4023,Grant Stuard,uard is an undersized off-the-ball linebacker ...,2021,OLB,HOU,32.0,259.0,7.0,TB,225.0,72.0,https://a.espncdn.com/i/headshots/nfldraft/pla...
4024,Robert Jones,started games at guard at the junior college...,1992,LB,ECU,24.0,24.0,1.0,DAL,,,


In [78]:
# add sentiment column with bert, using default bert tokenizer

tokenizer_sentiment = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased-finetuned-sst-2-english')
model_sentiment = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased-finetuned-sst-2-english')
sentiment_pipeline = pipeline('sentiment-analysis', model=model_sentiment, tokenizer=tokenizer_sentiment, device = device)

#ge5 label
def get_sentiment(text):
    text = text[:512]
    sentiment = sentiment_pipeline(text)
    if sentiment[0]['label'] == 'NEGATIVE':
        return -1
    else:
        return str(sentiment[0]['score'])


profiles['sentiment'] = profiles['draft_profile'].apply(get_sentiment)



In [79]:
profiles

Unnamed: 0,player_name,draft_profile,draft_year,pos_abbr,school_abbr,pick,overall,round,team_abbr,weight,height,player_image,sentiment
0,Alex Smith,Nephew of head coach . attempted only five ...,2005,QB,UTAH,1.0,1.0,1.0,SF,217.0,76.0,,-1
1,Ronnie Brown,has played second fiddle to nell throughout ...,2005,RB,AUB,2.0,2.0,1.0,MIA,233.0,72.0,,-1
2,Braylon Edwards,"' father, , played at and in the with the a...",2005,WR,MICH,3.0,3.0,1.0,CLE,211.0,75.0,,0.9977142810821533
3,Cedric Benson,was drafted by the in the th round out of hi...,2005,RB,TEX,4.0,4.0,1.0,CHI,222.0,71.0,,0.9970932006835938
4,Carnell Williams,started two games and played in nine in befo...,2005,RB,AUB,5.0,5.0,1.0,TB,217.0,71.0,,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4021,Jermar Jefferson,is an efficient back who follows and reads bl...,2021,RB,ORST,30.0,257.0,7.0,DET,206.0,70.0,https://a.espncdn.com/i/headshots/nfldraft/pla...,-1
4022,Dax Milne,is a savvy route runner with average burst an...,2021,WR,BYU,31.0,258.0,7.0,WSH,193.0,73.0,https://a.espncdn.com/i/headshots/nfldraft/pla...,0.9981149435043335
4023,Grant Stuard,uard is an undersized off-the-ball linebacker ...,2021,OLB,HOU,32.0,259.0,7.0,TB,225.0,72.0,https://a.espncdn.com/i/headshots/nfldraft/pla...,0.9987921118736267
4024,Robert Jones,started games at guard at the junior college...,1992,LB,ECU,24.0,24.0,1.0,DAL,,,,-1


In [80]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')



In [81]:
embeddings = model.encode(profiles['draft_profile'].tolist())

In [82]:
embeddings

array([[-0.02400924,  0.02246189, -0.07962109, ...,  0.02992326,
         0.02146961,  0.0603883 ],
       [-0.04543393, -0.03509943, -0.02980237, ..., -0.04981454,
         0.00856797,  0.05311377],
       [-0.03942122, -0.0639299 , -0.02087854, ..., -0.01501575,
        -0.01959316,  0.05965216],
       ...,
       [ 0.0116369 , -0.03493826, -0.01862857, ..., -0.02657516,
         0.04190882,  0.01508302],
       [ 0.00370188,  0.03022077, -0.07987583, ..., -0.05268501,
        -0.01219928,  0.0610402 ],
       [-0.0211548 ,  0.02376484, -0.06796203, ..., -0.00535694,
         0.01767545,  0.0520233 ]], dtype=float32)

In [83]:
similarity_matrix = cosine_similarity(embeddings)








In [84]:
similarity_df = pd.DataFrame(similarity_matrix, index=profiles['player_name'], columns=profiles['player_name'])

In [85]:
similarity_df

player_name,Alex Smith,Ronnie Brown,Braylon Edwards,Cedric Benson,Carnell Williams,Adam Jones,Troy Williamson,Antrel Rolle,Carlos Rogers,Mike Williams,...,Chris Garrett,Marquiss Spencer,Pressley Harvin III,Kawaan Baker,Kylin Hill,Jermar Jefferson,Dax Milne,Grant Stuard,Robert Jones,Tim Jones
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alex Smith,1.000000,0.476609,0.565490,0.630389,0.693408,0.665695,0.635621,0.598615,0.744405,0.719425,...,0.421170,0.491108,0.490084,0.522450,0.587716,0.638864,0.457900,0.429135,0.604775,0.522229
Ronnie Brown,0.476609,1.000000,0.551036,0.593315,0.614389,0.628726,0.559088,0.488225,0.698091,0.634652,...,0.360399,0.423745,0.409443,0.463884,0.538701,0.505686,0.472633,0.465313,0.509646,0.465220
Braylon Edwards,0.565490,0.551036,1.000000,0.582401,0.509050,0.583387,0.636203,0.454950,0.610896,0.543281,...,0.378306,0.386349,0.451020,0.472596,0.479897,0.532337,0.419584,0.344625,0.455690,0.412015
Cedric Benson,0.630389,0.593315,0.582401,1.000000,0.734261,0.683382,0.639787,0.560410,0.727590,0.663984,...,0.391923,0.612055,0.449053,0.597404,0.592725,0.624510,0.674014,0.512724,0.649701,0.675816
Carnell Williams,0.693408,0.614389,0.509050,0.734261,1.000000,0.665222,0.682911,0.585587,0.755781,0.746248,...,0.458924,0.545578,0.477142,0.591990,0.642036,0.664936,0.623411,0.530311,0.721177,0.684113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jermar Jefferson,0.638864,0.505686,0.532337,0.624510,0.664936,0.758730,0.537334,0.617805,0.679013,0.593479,...,0.579235,0.579929,0.543289,0.687124,0.732827,1.000000,0.675035,0.551464,0.654383,0.673405
Dax Milne,0.457900,0.472633,0.419584,0.674014,0.623411,0.565948,0.494793,0.489192,0.572799,0.572062,...,0.387680,0.676194,0.387692,0.696208,0.694720,0.675035,1.000000,0.549212,0.664359,0.836526
Grant Stuard,0.429135,0.465313,0.344625,0.512724,0.530311,0.531820,0.357236,0.455455,0.495746,0.471953,...,0.291216,0.520116,0.365038,0.508498,0.522695,0.551464,0.549212,1.000000,0.518708,0.554709
Robert Jones,0.604775,0.509646,0.455690,0.649701,0.721177,0.716075,0.556046,0.586502,0.714940,0.678378,...,0.495151,0.720294,0.509421,0.662996,0.679584,0.654383,0.664359,0.518708,1.000000,0.670501


In [86]:
similarity_df.sort_values(by = 'Joe Burrow', ascending = False).head(10)


player_name,Alex Smith,Ronnie Brown,Braylon Edwards,Cedric Benson,Carnell Williams,Adam Jones,Troy Williamson,Antrel Rolle,Carlos Rogers,Mike Williams,...,Chris Garrett,Marquiss Spencer,Pressley Harvin III,Kawaan Baker,Kylin Hill,Jermar Jefferson,Dax Milne,Grant Stuard,Robert Jones,Tim Jones
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Joe Burrow,0.667249,0.547796,0.561737,0.642604,0.612085,0.721309,0.553823,0.599182,0.695061,0.665384,...,0.489352,0.654132,0.530777,0.719868,0.691546,0.755297,0.671833,0.536448,0.642857,0.666317
Ian Book,0.680205,0.543125,0.527824,0.668381,0.643171,0.712577,0.529315,0.568315,0.710164,0.666234,...,0.491203,0.63878,0.565071,0.639689,0.728421,0.760724,0.666652,0.550626,0.665329,0.664741
Trevor Lawrence,0.66861,0.523412,0.534969,0.724195,0.665337,0.717701,0.568105,0.555533,0.69209,0.688021,...,0.436925,0.657664,0.559533,0.680982,0.687732,0.738541,0.700037,0.570904,0.660303,0.735086
Nate Stanley,0.737348,0.61402,0.576452,0.733018,0.699328,0.768714,0.603242,0.599311,0.812583,0.7539,...,0.466411,0.657885,0.562509,0.707718,0.744438,0.75313,0.653062,0.577452,0.718572,0.649098
Drew Lock,0.751464,0.529165,0.54107,0.675271,0.710035,0.729464,0.58926,0.610645,0.74278,0.700076,...,0.439508,0.597645,0.543229,0.690778,0.693732,0.74052,0.620399,0.497,0.686752,0.669159
D'Andre Walker,0.65227,0.594286,0.535224,0.739839,0.694255,0.746739,0.583279,0.586571,0.731138,0.719525,...,0.505675,0.699327,0.487579,0.746534,0.71208,0.743004,0.725877,0.597437,0.737387,0.748483
Dwayne Haskins,0.695432,0.665463,0.587154,0.737369,0.719417,0.741255,0.625397,0.593705,0.81471,0.751444,...,0.473878,0.626599,0.52341,0.733306,0.755877,0.740248,0.719339,0.558509,0.69962,0.686708
Zach Wilson,0.689203,0.56129,0.55092,0.694037,0.632014,0.709378,0.577698,0.577374,0.720028,0.681847,...,0.400552,0.576452,0.483698,0.636901,0.656047,0.68604,0.628075,0.514435,0.627447,0.634339
Davis Mills,0.676303,0.594022,0.542123,0.710739,0.671732,0.714208,0.585345,0.578435,0.769109,0.678331,...,0.429059,0.605796,0.487371,0.660928,0.6847,0.728609,0.648362,0.51394,0.671726,0.629416
Mac Jones,0.596059,0.572746,0.533829,0.709283,0.704434,0.707584,0.587688,0.560867,0.669652,0.674521,...,0.522761,0.684914,0.537144,0.749083,0.755452,0.774633,0.780992,0.565849,0.728592,0.776568


In [87]:
similarity_df

player_name,Alex Smith,Ronnie Brown,Braylon Edwards,Cedric Benson,Carnell Williams,Adam Jones,Troy Williamson,Antrel Rolle,Carlos Rogers,Mike Williams,...,Chris Garrett,Marquiss Spencer,Pressley Harvin III,Kawaan Baker,Kylin Hill,Jermar Jefferson,Dax Milne,Grant Stuard,Robert Jones,Tim Jones
player_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alex Smith,1.000000,0.476609,0.565490,0.630389,0.693408,0.665695,0.635621,0.598615,0.744405,0.719425,...,0.421170,0.491108,0.490084,0.522450,0.587716,0.638864,0.457900,0.429135,0.604775,0.522229
Ronnie Brown,0.476609,1.000000,0.551036,0.593315,0.614389,0.628726,0.559088,0.488225,0.698091,0.634652,...,0.360399,0.423745,0.409443,0.463884,0.538701,0.505686,0.472633,0.465313,0.509646,0.465220
Braylon Edwards,0.565490,0.551036,1.000000,0.582401,0.509050,0.583387,0.636203,0.454950,0.610896,0.543281,...,0.378306,0.386349,0.451020,0.472596,0.479897,0.532337,0.419584,0.344625,0.455690,0.412015
Cedric Benson,0.630389,0.593315,0.582401,1.000000,0.734261,0.683382,0.639787,0.560410,0.727590,0.663984,...,0.391923,0.612055,0.449053,0.597404,0.592725,0.624510,0.674014,0.512724,0.649701,0.675816
Carnell Williams,0.693408,0.614389,0.509050,0.734261,1.000000,0.665222,0.682911,0.585587,0.755781,0.746248,...,0.458924,0.545578,0.477142,0.591990,0.642036,0.664936,0.623411,0.530311,0.721177,0.684113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jermar Jefferson,0.638864,0.505686,0.532337,0.624510,0.664936,0.758730,0.537334,0.617805,0.679013,0.593479,...,0.579235,0.579929,0.543289,0.687124,0.732827,1.000000,0.675035,0.551464,0.654383,0.673405
Dax Milne,0.457900,0.472633,0.419584,0.674014,0.623411,0.565948,0.494793,0.489192,0.572799,0.572062,...,0.387680,0.676194,0.387692,0.696208,0.694720,0.675035,1.000000,0.549212,0.664359,0.836526
Grant Stuard,0.429135,0.465313,0.344625,0.512724,0.530311,0.531820,0.357236,0.455455,0.495746,0.471953,...,0.291216,0.520116,0.365038,0.508498,0.522695,0.551464,0.549212,1.000000,0.518708,0.554709
Robert Jones,0.604775,0.509646,0.455690,0.649701,0.721177,0.716075,0.556046,0.586502,0.714940,0.678378,...,0.495151,0.720294,0.509421,0.662996,0.679584,0.654383,0.664359,0.518708,1.000000,0.670501


In [88]:
joe_shiesty = similarity_df.loc['Joe Burrow']

In [89]:
joe_shiesty.sort_values(ascending = False).head(20)

Unnamed: 0_level_0,Joe Burrow
player_name,Unnamed: 1_level_1
Joe Burrow,1.0
Ian Book,0.876868
Trevor Lawrence,0.863096
Nate Stanley,0.860192
Drew Lock,0.85239
D'Andre Walker,0.845148
Dwayne Haskins,0.843203
Zach Wilson,0.831548
Davis Mills,0.829257
Mac Jones,0.827918


In [90]:
# generate the three most similar players to Joe Burrow, making sure they match his position and were drafted in the same round

