In [1]:
import os
import pandas as pd

# Directory that holds the labeled datasets. It defaults to the original
# author's location; set LABELED_DATA_DIR to run the notebook elsewhere
# without editing this cell.
DATA_DIR = os.environ.get('LABELED_DATA_DIR', '/Users/jacksonwalters/tensorflow_datasets')
os.chdir(DATA_DIR)

# Source name -> tab-separated file (no header row), relative to DATA_DIR.
filepath_dict = {'tweet':   'labeled_tweets/labeled_tweets.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

# ignore_index gives the combined frame one unique 0..n-1 index instead of
# repeating every file's own row numbers once more sources are added.
df = pd.concat(df_list, ignore_index=True)
print(df.iloc[0])

sentence    If we have any hope of ending this chaos, we h...
label                                                       0
source                                                  tweet
Name: 0, dtype: object


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [3]:
#select the rows that came from the tweet dataset
df_tweet = df[df['source'] == 'tweet']
sentences = df_tweet['sentence'].values
#tweet sentiment labels. 0 = negative, 1 = positive
y = df_tweet['label'].values
#split the sentences into training data and test data (75% / 25%, fixed seed so the split is repeatable)
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)
#vectorize by word counting; the vocabulary is learned from the training sentences only
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

CountVectorizer()

In [4]:
#word-count matrix for the training sentences (one row per sentence)
X_train = vectorizer.transform(sentences_train)
X_train

<1500x5714 sparse matrix of type '<class 'numpy.int64'>'
	with 36890 stored elements in Compressed Sparse Row format>

In [5]:
#word-count matrix for the test sentences, using the vocabulary fitted on the training set
X_test = vectorizer.transform(sentences_test)
X_test

<500x5714 sparse matrix of type '<class 'numpy.int64'>'
	with 11236 stored elements in Compressed Sparse Row format>

In [6]:
#fit a logistic regression on the training counts, then report accuracy on the held-out test set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.976


In [7]:
#train and score one bag-of-words logistic regression per data source
for source in df['source'].unique():
    #rows belonging to the current source only
    df_source = df.loc[df['source'] == source]
    sentences, y = df_source['sentence'].values, df_source['label'].values

    #same 75/25 split and seed for every source
    split = train_test_split(sentences, y, test_size=0.25, random_state=1000)
    sentences_train, sentences_test, y_train, y_test = split

    #learn the vocabulary from the training sentences and count them in one pass
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for tweet data: 0.9760


In [8]:
#classify one hand-written string with the vectorizer/classifier currently in scope
test_str = "Joe Biden"
#the single string is wrapped in a list so it is vectorized as one document
x_test_str = vectorizer.transform([test_str])
prediction = classifier.predict(x_test_str)
prediction

array([1])