V1 of small-scale discovery pipeline for twitter:
- Predict if results of twitter searches are relevant (twitter specific training set)
- Scrape relevant result URLs 
- Predict if URLs are relevant (musoW specific training set)
- Return results in a table w/ all necessary info + prediction info (score etc)
- Options to do additional filtering at result stages: filter for specific keywords, filter for language, filter for unwanted URLs

TO DO:
- Export twitter training to pkl to load faster
- Add sites to blacklist before scraping
- Add values to remove after scraping, e.g. 404 
- Integrate twitter search

Step 1:
- Load libraries and functions 

In [1]:
#imports + path
from __future__ import print_function
import requests
import pandas as pd
import pickle
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import classification_report 
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from bs4 import BeautifulSoup
# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100
# Root folder of the pipeline; every pickle read/written below is resolved against it.
path = '/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/'

In [18]:
#Log Reg crossval function to check training tests 
def lr(x, y, cv_value, scoring_value, title):
    """Cross-validate a LogisticRegressionCV model and print a summary.

    Args:
        x: feature matrix handed to scikit-learn.
        y: target labels aligned with ``x``.
        cv_value: passed as ``cv`` both to the model and to the outer
            ``cross_val_predict`` / ``cross_val_score`` calls.
        scoring_value: scikit-learn scoring name (e.g. ``'precision'``), used by
            the model and by ``cross_val_score``.
        title: heading printed above the results.

    Returns:
        None. The title, the mean cross-validated score and the classification
        report are printed, one item per line.
    """
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_value, scoring=scoring_value)
    # Alternative kept for reference: LogisticRegression(solver='liblinear', C=10.0, random_state=44)
    y_pred = cross_val_predict(model, x, y, cv=cv_value)
    scores = cross_val_score(model, x, y, cv=cv_value, scoring=scoring_value)
    report = classification_report(y, y_pred)
    # Label the mean with the metric that was actually requested; the label used to be
    # hard-coded to PRECISION even when another scoring was passed.
    metric = scoring_value.upper() if isinstance(scoring_value, str) else 'SCORE'
    print(title, f'MEAN {metric}', np.mean(scores), 'report:', report, sep='\n')

In [2]:
#PREDICTION FUNCTION W/ and W/O LOGREGCV 

def _fit_save_predict(model, t_input, t_feature, target, p_input, p_feature, filename, path):
    """Fit ``model`` on TF-IDF features, pickle it, and score ``p_input``.

    Shared implementation behind ``lr_model_predict_cv`` and ``lr_model_predict``,
    which differ only in the estimator they build.
    """
    import os  # local import so this cell does not depend on the notebook's import cell

    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()
    x_train = tfidf_transformer.fit_transform(count_vect.fit_transform(t_input[t_feature]))
    y_train = t_input[target].values
    model.fit(x_train, y_train)

    # NOTE: only the classifier is saved; the fitted CountVectorizer / TfidfTransformer
    # are not, so the .sav file alone cannot encode new text.
    # os.path.join tolerates a `path` with or without a trailing separator, and the
    # context manager guarantees the file handle is closed.
    export = os.path.join(path, 'LOGREG_RELEVANCE', f'{filename}.sav')
    with open(export, 'wb') as model_file:
        pickle.dump(model, model_file)

    # Encode the prediction texts with the vocabulary / idf weights learned on the training set.
    x_new = tfidf_transformer.transform(count_vect.transform(p_input[p_feature]))
    y_predict = model.predict(x_new)
    scores = model.decision_function(x_new)
    # predict_log_proba returns LOG-probabilities, one array per row.
    probability = model.predict_log_proba(x_new)

    result = p_input.copy()  # never mutate the caller's frame
    result['Prediction'] = list(y_predict)
    result['Score'] = list(scores)
    result['Probability'] = list(probability)
    result['Input Length'] = result[p_feature].str.len()
    return result


def lr_model_predict_cv(t_input, t_feature, target, cv_int, score_type, p_input, p_feature, filename, path):
    """Train a LogisticRegressionCV relevance model and predict on ``p_input``.

    Args:
        t_input: training DataFrame.
        t_feature: name of the text column in ``t_input``.
        target: name of the label column in ``t_input``.
        cv_int: passed as ``cv`` to LogisticRegressionCV.
        score_type: passed as ``scoring`` to LogisticRegressionCV.
        p_input: DataFrame to predict on.
        p_feature: name of the text column in ``p_input``.
        filename: base name of the pickled model, written to
            ``<path>/LOGREG_RELEVANCE/<filename>.sav``.
        path: root folder containing the ``LOGREG_RELEVANCE`` directory.

    Returns:
        A copy of ``p_input`` with four extra columns: ``Prediction``, ``Score``
        (decision function), ``Probability`` (log-probabilities per class) and
        ``Input Length`` (character length of ``p_feature``).
    """
    model = LogisticRegressionCV(solver='liblinear', random_state=44, cv=cv_int, scoring=score_type)
    return _fit_save_predict(model, t_input, t_feature, target, p_input, p_feature, filename, path)


def lr_model_predict(t_input, t_feature, target, p_input, p_feature, filename, path):
    """Train a plain LogisticRegression (C=10.0) relevance model and predict on ``p_input``.

    Same contract as ``lr_model_predict_cv`` minus the cross-validation arguments:
    returns a copy of ``p_input`` with ``Prediction``, ``Score``, ``Probability``
    (log-probabilities) and ``Input Length`` columns, and pickles the fitted model
    to ``<path>/LOGREG_RELEVANCE/<filename>.sav``.
    """
    model = LogisticRegression(solver='liblinear', C=10.0,random_state=44)
    return _fit_save_predict(model, t_input, t_feature, target, p_input, p_feature, filename, path)

In [3]:
#scrape URLs for title, desc and URL text 

def scrape_links(link_list, timeout=10):
    """Download each link and collect its title and paragraph text.

    Args:
        link_list: iterable of URL strings.
        timeout: seconds to wait for each request before giving up, so a single
            unresponsive host cannot stall the whole scrape.

    Returns:
        DataFrame with columns ``Title`` (text of the ``<title>`` tags in
        ``<head>``), ``Description`` (text of all ``<p>`` tags in ``<body>``,
        space-joined) and ``URL`` (the stripped input link). Links that cannot be
        fetched, or whose page lacks a ``<head>`` or ``<body>``, are skipped.
    """
    records = []
    for link in link_list:
        try:
            page = requests.get(link, timeout=timeout)
        except Exception:
            # Best-effort scrape: any failure (connection error, timeout, invalid URL)
            # skips this link. Previously a ConnectionError fell through and the page
            # fetched for the PREVIOUS link was parsed and stored under this URL.
            continue
        soup = BeautifulSoup(page.content, "html.parser")
        head = soup.find('head')
        body = soup.find('body')
        if head is None or body is None:
            continue
        title = ' '.join(t.text for t in head.find_all('title')).strip()
        text = ' '.join(p.text for p in body.find_all('p')).strip()
        records.append({'Title': title, 'Description': text, 'URL': link.strip()})
    # Build the frame once: DataFrame.append was quadratic and is gone in pandas 2.
    return pd.DataFrame(records, columns=['Title', 'Description', 'URL'])

Step 2:
- load training sets

In [4]:
#description training set 
# Description-based training sets, both stored as pickles under the pipeline root.
training_set_even_adds = pd.read_pickle(f'{path}LOGREG_RELEVANCE/trainingset_even_extended.pkl')
new_training_set = pd.read_pickle(f'{path}new_training_set.pkl')

In [5]:
#negative twitter training set
#music_culture = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_music_culture.pkl')
#music_history = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_music_history.pkl')
#music_magazine = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_music_magazine.pkl')
#music_oral_h = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_music_oral_history.pkl')
#music_research = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_music_research.pkl')
#sound_archive = pd.read_pickle(path+'TWITTER_SEARCHES/MJI BIGRAMS (NEGATIVE)/twitter_sound_archive.pkl')
# Negative examples: tweets from the two 2021 searches, all labelled '0'.
dh = pd.read_pickle(f'{path}TWITTER_SEARCHES/NEGATIVE/digital_humanities_2021.pkl')
music_company = pd.read_pickle(f'{path}TWITTER_SEARCHES/NEGATIVE/music_company_2021.pkl')
twitter_neg = (
    pd.concat([dh, music_company])
    .assign(Target='0')           # label is a string here; cast to int when the sets are merged
    [['tweet', 'Target']]         # keep only the text and the label
    .reset_index(drop=True)
)

In [6]:
#positive twitter training set 
# Positive examples: tweets from the musoW bigram searches, all labelled '1'.
midi_file = pd.read_pickle(f'{path}TWITTER_SEARCHES/MUSOW BIGRAMS (POSITIVE)/twitter_midi_file.pkl')
music_collection = pd.read_pickle(f'{path}TWITTER_SEARCHES/MUSOW BIGRAMS (POSITIVE)/twitter_music_collection.pkl')
music_dataset = pd.read_pickle(f'{path}TWITTER_SEARCHES/MUSOW BIGRAMS (POSITIVE)/twitter_music_dataset.pkl')
music_data = pd.read_pickle(f'{path}TWITTER_SEARCHES/MUSOW BIGRAMS (POSITIVE)/twitter_music_data.pkl')
music_library = pd.read_pickle(f'{path}TWITTER_SEARCHES/MUSOW BIGRAMS (POSITIVE)/twitter_music_library.pkl')
sheet_music = pd.read_pickle(f'{path}TWITTER_SEARCHES/MUSOW BIGRAMS (POSITIVE)/twitter_sheet_music.pkl')
song_dataset = pd.read_pickle(f'{path}TWITTER_SEARCHES/MUSOW BIGRAMS (POSITIVE)/twitter_song_dataset.pkl')
twitter_pos = (
    pd.concat([midi_file, music_collection, music_dataset, music_data, music_library, sheet_music, song_dataset])
    .assign(Target='1')
    [['tweet', 'Target']]
    .reset_index(drop=True)
    # NOTE(review): 7327 presumably matches the size of the negative set so the
    # classes are balanced -- confirm. The seed makes the sample reproducible.
    .sample(n=7327, random_state=56)
)

In [7]:
#final twitter training set
# Stack positives on top of negatives, turn the string labels into integers
# and renumber the rows.
twitter_set = (
    pd.concat([twitter_pos, twitter_neg])
    .astype({'Target': 'int'})
    .reset_index(drop=True)
)

Step 3 (optional):
- test logreg crossval score of training sets

In [22]:
# Encoders used to turn the descriptions into TF-IDF features.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

# Labels for the cross-validation check.
y_tfidf = new_training_set['Target'].values

# Raw term counts first, then TF-IDF weighting; x_tfidf is the same matrix object
# as training_set_tfidf.
base_train_counts = count_vect.fit_transform(new_training_set['Description'])
x_tfidf = training_set_tfidf = tfidf_transformer.fit_transform(base_train_counts)

In [None]:
# 10-fold cross-validation scored on precision.
lr(x=x_tfidf, y=y_tfidf, cv_value=10, scoring_value='precision', title='Test')

Twitter training set: scores between 0.80 and 0.97 on precision with cv values of 2, 5 and 10. 
Archive training set: scores around 0.65 with LogRegCV but around 0.80 with normal LogReg; precision on the negative class is low with LogRegCV. 
New training set: scores 0.97-0.98 with LogRegCV.

Step 4:
- Load prediction sets
- Filter by language (eng)

In [8]:
# Load the tweets to classify and keep only those tagged as English.
prediction_twitter = pd.read_pickle(f'{path}TWITTER_SEARCHES/PREDICTIONS/digital_archive_22.pkl')
is_english = prediction_twitter['lang'] == 'en'
prediction_twitter = prediction_twitter[is_english]

In [27]:
#check length 
# Number of English tweets left to classify.
prediction_twitter.shape[0]

963

Step 5:
- run the predict function against chosen sets (twitter)
- filter the results by prediction score (only positives) and optionally by inclusion of the 'music' kw
- return a df w/ tweet, prediction value, score, probability, length of input and url 

In [9]:
#variable for removing unwanted results 
# Fragments that mark a result as unwanted. They are joined with '|' and used as a
# regular expression against URLs and page titles in the filter cells.
discard = [
    'youtu',
    '404',
    'Not Found',
]

In [10]:
#run w/ LogRegCV
# Train on the twitter set (5-fold LogRegCV, precision scoring) and score the new tweets.
tweet_predict_cv = lr_model_predict_cv(
    t_input=twitter_set,
    t_feature='tweet',
    target='Target',
    cv_int=5,
    score_type='precision',
    p_input=prediction_twitter,
    p_feature='tweet',
    filename='twitter_test_cv',
    path='/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/',
)

In [11]:
#filter and display results 
# Keep positive predictions, drop unwanted URLs, rank by score and trim the columns.
tweet_predict_cv_df = (
    tweet_predict_cv
    .loc[lambda df: df['Prediction'] == 1]
    .loc[lambda df: ~df['url'].str.contains('|'.join(discard))]
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
    [['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'url']]
)
tweet_predict_cv_df

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,As a new semester gets underway and Packard Li...,1,2.49724,"[-2.576339159548708, -0.07909938741888302]",245,http://www.ccad.edu/blogs/ccad-launches-new-di...
1,"ICYMI, the 'Glory Years Collection' is now ava...",1,2.260489,"[-2.3597000875044882, -0.09921117858834196]",218,https://drive.google.com/drive/folders/1CRdzXB...
2,It's so annoying. Click in on the Digital Lib...,1,2.03283,"[-2.155901131355724, -0.12307063325970813]",164,https://www.torontopubliclibrary.ca/search.jsp...
3,"Brewster Cale's free digital archive has 70,00...",1,1.553153,"[-1.7450781153859058, -0.19192514038208824]",279,https://archive.org
4,Ever wondered what happens during the document...,1,1.113837,"[-1.3977342165203548, -0.28389768592889597]",217,https://www.storetec.net/resources/blog/the-do...
5,@Greatbert @YesHunter @sprucehen_ https://t.co...,1,1.056349,"[-1.354765623194988, -0.29841649193405434]",90,"https://vilda.alaska.edu, https://miaclab.org/..."
6,My Gumroad's been updated with a pack of all m...,1,0.924603,"[-1.2587073224145946, -0.3341043537885582]",284,https://akairiot.gumroad.com
7,Trying to dig my way out of my email -- @activ...,1,0.812087,"[-1.1794556982779303, -0.3673691182871412]",148,https://www.bogalusadailynews.com/2022/01/18/h...
8,Indianapolis Public Library: Indianapolis Publ...,1,0.794205,"[-1.1671060536125537, -0.3729007699190078]",151,https://patch.com/indiana/indianapolis/indiana...
9,A digital archive dedicated to life on Earth. ...,1,0.792028,"[-1.1656067231206384, -0.37357911598382626]",226,https://www.thisiscolossal.com/2020/01/biodive...


In [12]:
#optional filter by kw
substring = 'music'
# Case-insensitive keyword test on every string cell, reduced to ONE boolean per row.
# The previous cell-level 2-D mask passed to .loc repeated a row once per matching
# column (or raised, depending on the pandas version), and applymap is deprecated.
mask = (
    tweet_predict_cv_df
    .apply(lambda column: column.map(lambda x: substring in x.lower() if isinstance(x, str) else False))
    .any(axis=1)
    .to_numpy()
)
tweet_predict_cv_df_kw = tweet_predict_cv_df.loc[mask] 
tweet_predict_cv_df_kw

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
15,https://t.co/RhyGUCu2Kv: hear songs by Finch i...,1,0.649459,"[-1.0696996078424883, -0.4202410922158521]",154,https://tinyurl.com/Anne-Finch-Digital-Archive


In [13]:
#run w/ LogReg
# Same training/prediction data as the LogRegCV run, using plain LogisticRegression.
tweet_predict = lr_model_predict(
    t_input=twitter_set,
    t_feature='tweet',
    target='Target',
    p_input=prediction_twitter,
    p_feature='tweet',
    filename='twitter_test',
    path='/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/',
)

In [14]:
#filter and display results 
# Keep positive predictions, drop unwanted URLs, rank by score and trim the columns.
tweet_predict_df = (
    tweet_predict
    .loc[lambda df: df['Prediction'] == 1]
    .loc[lambda df: ~df['url'].str.contains('|'.join(discard))]
    .sort_values(by='Score', ascending=False)
    .reset_index(drop=True)
    [['tweet', 'Prediction', 'Score', 'Probability', 'Input Length', 'url']]
)
tweet_predict_df

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
0,As a new semester gets underway and Packard Li...,1,3.813256,"[-3.835091622386986, -0.021836039861482318]",245,http://www.ccad.edu/blogs/ccad-launches-new-di...
1,"ICYMI, the 'Glory Years Collection' is now ava...",1,2.991122,"[-3.0401318786743188, -0.04901019895906712]",218,https://drive.google.com/drive/folders/1CRdzXB...
2,It's so annoying. Click in on the Digital Lib...,1,2.745373,"[-2.8076196753722638, -0.062246185760940126]",164,https://www.torontopubliclibrary.ca/search.jsp...
3,"Brewster Cale's free digital archive has 70,00...",1,2.100739,"[-2.2161776820047168, -0.11543895716294954]",279,https://archive.org
4,@Greatbert @YesHunter @sprucehen_ https://t.co...,1,1.5334,"[-1.7288029485248457, -0.19540285179516134]",90,"https://vilda.alaska.edu, https://miaclab.org/..."
5,Ever wondered what happens during the document...,1,1.481689,"[-1.6864678947202014, -0.20477874486566575]",217,https://www.storetec.net/resources/blog/the-do...
6,Indianapolis Public Library: Indianapolis Publ...,1,1.271503,"[-1.5186831665223952, -0.24718023643936568]",151,https://patch.com/indiana/indianapolis/indiana...
7,A Digital Archive of Soviet Children’s Books G...,1,1.0705,"[-1.3652849242113754, -0.2947844644522677]",154,https://www.openculture.com/2017/07/a-digital-...
8,Did you know about the Des Moines Register dig...,1,1.000363,"[-1.3135268237352378, -0.31316416703945665]",236,https://buff.ly/3oqLORQ
9,"Back in 1994, Betty Boothroyd was appointed th...",1,0.993132,"[-1.3082453124293276, -0.3151134586650556]",288,http://ow.ly/nhBo50IcCIi


In [33]:
#optional filter by kw
substring = 'music'
# Case-insensitive keyword test on every string cell, reduced to ONE boolean per row.
# The previous cell-level 2-D mask passed to .loc repeated a row once per matching
# column (or raised, depending on the pandas version), and applymap is deprecated.
mask = (
    tweet_predict_df
    .apply(lambda column: column.map(lambda x: substring in x.lower() if isinstance(x, str) else False))
    .any(axis=1)
    .to_numpy()
)
tweet_predict_df_kw = tweet_predict_df.loc[mask] 
tweet_predict_df_kw

Unnamed: 0,tweet,Prediction,Score,Probability,Input Length,url
11,https://t.co/RhyGUCu2Kv: hear songs by Finch i...,1,0.947396,"[-1.2750795021026953, -0.3276832286116716]",154,https://tinyurl.com/Anne-Finch-Digital-Archive


Step 6:
- grab links from twitter predictions and scrape them for text 
- return a new table that can be used as prediction against the musow training set 

In [15]:
#URLs to list 
# Collect the URLs of the positive tweets, leaving out links that point back to twitter.
twitter_link_list_cv = list(filter(lambda link: 'twitter' not in link, tweet_predict_cv_df['url']))
twitter_link_list = list(filter(lambda link: 'twitter' not in link, tweet_predict_df['url']))

In [17]:
#scrape URL list
# Scrape both link lists. `links_to_add` must be built here as well: the
# empty-description cleanup and the LogReg prediction cells below read it, and with
# this line commented out a fresh Restart & Run All fails with a NameError.
links_to_add_cv = scrape_links(twitter_link_list_cv)
links_to_add = scrape_links(twitter_link_list)

KeyboardInterrupt: 

In [26]:
#remove empty descriptions 
# Pages with no paragraph text give the classifier nothing to work with; drop them
# and renumber the rows.
links_to_add_cv = links_to_add_cv.loc[lambda df: df['Description'] != ''].reset_index(drop=True)
links_to_add = links_to_add.loc[lambda df: df['Description'] != ''].reset_index(drop=True)

Step 7:
- run the predict function against chosen sets (musoW)

In [32]:
#run with LogRegCV 
# Train on the musoW description set (10-fold LogRegCV, precision scoring) and
# score the scraped pages.
twitter_preds_cv = lr_model_predict_cv(
    t_input=new_training_set,
    t_feature='Description',
    target='Target',
    cv_int=10,
    score_type='precision',
    p_input=links_to_add_cv,
    p_feature='Description',
    filename='extended_even_model_cv_twitter',
    path='/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/',
)

In [44]:
#filter results by positive value and score
# Keep positive predictions whose page title contains none of the discard fragments.
twitter_preds_cv_df = (
    twitter_preds_cv
    .loc[lambda df: df['Prediction'] == 1]
    .loc[lambda df: ~df['Title'].str.contains('|'.join(discard))]
)
# Displayed ranked by score; the sorted view is not stored back.
twitter_preds_cv_df.sort_values(by='Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Title,Description,URL,Prediction,Score,Probability,Input Length


In [34]:
#run with LogReg 
# Train plain LogisticRegression on the musoW description set and score the scraped pages.
twitter_preds = lr_model_predict(
    t_input=new_training_set,
    t_feature='Description',
    target='Target',
    p_input=links_to_add,
    p_feature='Description',
    filename='extended_even_model_twitter',
    path='/Users/laurentfintoni/Desktop/University/COURSE DOCS/THESIS/Internship/musow-pipeline/',
)

In [46]:
#filter results by positive value and score
# Keep positive predictions whose page title contains none of the discard fragments.
twitter_preds_df = (
    twitter_preds
    .loc[lambda df: df['Prediction'] == 1]
    .loc[lambda df: ~df['Title'].str.contains('|'.join(discard))]
)
# Displayed ranked by score; the sorted view is not stored back.
twitter_preds_df.sort_values(by='Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Title,Description,URL,Prediction,Score,Probability,Input Length
0,"If Not Here, Then Where?",3D Assets,https://oncyber.io/if-not-here-then-where,1,3.22711,"[-3.2660150807696704, -0.03890527196295917]",9
1,Des Moines Register | Des Moines Public Library,Read full-text articles of today's Des Moines ...,https://buff.ly/3oqLORQ,1,2.9677,"[-3.0178426047078024, -0.050143025181493106]",210
2,"Cuttly | Free Custom URL Shortener, Branded UR...",Keep calm and shorten/manage long URLs with cu...,"https://cutt.ly/NO2xgZN, https://cutt.ly/AO2xYDx",1,2.611592,"[-2.6824391251950415, -0.07084758020804563]",147
3,The Palestinian Museum -Digital Archive - الار...,صورة صورة التقطت عام 1970 لحسن الخطيب مع طلاب ...,https://palarchive.org/index.php/Detail/object...,1,1.996386,"[-2.123745466214168, -0.12735950083268982]",331
4,The Palestinian Museum -Digital Archive - الار...,صورة صورة التقطت عام 1970 لحسن الخطيب مع طلاب ...,https://cutt.ly/qO2IDot,1,1.996386,"[-2.123745466214168, -0.12735950083268982]",331
5,Home | Search the archive | British Newspaper ...,Access hundreds of historic newspapers from al...,https://www.britishnewspaperarchive.co.uk/,1,1.862548,"[-2.00688738048501, -0.14433974172840525]",913
6,Soutron LMS - Search,Learn more \n\nRead the latest\n Learn More\n...,https://www.marshallfoundation.org/library/dig...,1,1.787162,"[-1.9419710429541304, -0.15480870871020935]",150
7,Soutron LMS - Search,Learn more \n\nRead the latest\n Learn More\n...,https://www.marshallfoundation.org/library/dig...,1,1.787162,"[-1.9419710429541304, -0.15480870871020935]",150
8,iDigOrion – Digital Archive | Orion Township P...,"Welcome to iDigOrion, a research tool funded b...",https://orionlibrary.org/idigorion/,1,1.24914,"[-1.501260406508946, -0.25212074232549736]",806
9,Home Page - Anne Finch Digital Archive,"Search options by Peter Cross, circa 1690 \r\n...",https://tinyurl.com/Anne-Finch-Digital-Archive,1,1.237588,"[-1.4922949267309136, -0.25470650538082834]",1186
