In [1]:
import codecs, re, json, os, time
from pyspark import SparkContext, SparkConf
from pyspark.mllib.fpm import FPGrowth
from pyspark.sql import SQLContext, Row
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer, IDF, StopWordsRemover

<h1>Create Spark and SQL context:</h1>

In [2]:
conf = SparkConf().setAppName("Text Classifier")
# if not sc:
sc = SparkContext(conf=conf)

In [3]:
sqlContext = SQLContext(sc)

<h1>Load Configuratoin File</h1>

In [4]:
def load_config(config_file):
    """
    Load collection configuration file.
    """
    with open(config_file) as data_file:    
        config_data = json.load(data_file)
    return config_data

<h1>Parse Tweets to tweet_id and tweet_text</h1>

In [5]:
def parse_tweet(line):
    """
    Parses a tweet record having the following format collectionId-tweetId<\t>tweetString
    """
    fields = line.strip().split("\t")
    if len(fields) == 2:
        # The following regex just strips of an URL (not just http), any punctuations, 
        # User Names or Any non alphanumeric characters
        # http://goo.gl/J8ZxDT
        text = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",fields[1]).strip()
        # remove terms <= 2 characters
        text = ' '.join(filter(lambda x: len(x) > 2, text.split(" ")))
        # return tuple of (collectionId-tweetId, text)
        return (fields[0], text)

<h1>Load tweets from file into DataFrame:</h1>

In [6]:
def Load_tweets(collection_id):
    tweets_file = os.path.join(base_dir , data_dir , "z_" + collection_id)
    print("Loading " + tweets_file) 
    if not os.path.isdir(tweets_file):
        print(tweets_file + " folder doesn't exist.")
        return False
    tweets = sc.textFile(tweets_file) \
              .map(parse_tweet) \
              .filter(lambda x: x is not None) \
              .map(lambda x: Row(id=x[0], text=x[1])) \
              .toDF() \
              .cache()
    return tweets

<h1>Tokenize and remove stop words from tweet text:</h1>

In [7]:
def preprocess_tweets(tweets):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    tweets = remover.transform(tweets)
    return tweets

<h1>Save unique tokens:</h1>

In [8]:
def save_unique_token(tweets):
    tweets = (tweets
      .rdd
      .map(lambda x : (x.id, x.text, list(set(filter(None, x.filtered)))))
      .toDF()
      .withColumnRenamed("_1","id")
      .withColumnRenamed("_2","text")
      .withColumnRenamed("_3","filtered")).cache()
    return tweets

<h2>Run Frequent Pattern Mining algorithm and save to output file:</h2>

In [9]:
def run_FPM(tweets, collection):
    model = FPGrowth.train(tweets.select("filtered").rdd.map(lambda x: x[0]), minSupport=0.02)
    result = sorted(model.freqItemsets().collect(), reverse=True)
    # sort the result in reverse order
    sorted_result = sorted(result, key=lambda item: int(item.freq), reverse=True)

    # save output to file
    with codecs.open(FP_dir + time.strftime("%Y%m%d-%H%M%S") + '_'
                            + collection["Id"] + '_' 
                            + collection["name"] + '.txt', 'w',encoding='utf-8') as file:
        for item in sorted_result:
            file.write("%s %s\n" % (item.freq, ' '.join(item.items)))

<h1>Global Variables</h1>

In [10]:
base_dir = "/home/hshahin/Spring2016_IR_Project/data/"
data_dir = "small_data"
predictions_dir = os.path.join(base_dir , data_dir, "predictions")
FP_dir = base_dir + "FPGrowth/"
config_file = "collections_config.json"
config_data = load_config(os.path.join(base_dir , config_file))

<h1>Phase I: Run FPM to all data sets</h1>

In [11]:
for x in config_data["collections"]:
    tweets = Load_tweets(x["Id"])
    if tweets:
        tweets = preprocess_tweets(tweets)
        tweets = save_unique_token(tweets)
        run_FPM(tweets, x)

Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_602
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_541
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_668
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_700
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_686
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_694
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_532
/home/hshahin/Spring2016_IR_Project/data/small_data/z_532 folder doesn't exist.


<h1>Manually choose frequent patterns and write them in the configuration file.</h1>
<h1>Reload the configuration file:</h1>

In [12]:
# get FP from config file
config_data = load_config(os.path.join(base_dir , config_file))

<h1>Create training data of positive and negative samples as DataFrame</h1>

In [13]:
def create_training_data(tweets, freq_patterns):
    # Tweets contains the frequent pattern terms will be considered as positive samples
    positive_tweets = (tweets
      .rdd
      .filter(lambda x: set(freq_patterns).issubset(x.filtered))
      .map(lambda x : (x[0], x[1], x[2], 1.0))
      .toDF()
      .withColumnRenamed("_1","id")
      .withColumnRenamed("_2","text")
      .withColumnRenamed("_3","filtered")
      .withColumnRenamed("_4","label"))

    # calculate a fraction of positive samples to extract equivalent number of negative samples
    positive_fraction = float(positive_tweets.count()) / tweets.count()

    # Negative samples will be randomly selected from non_positive samples
    negative_tweets = (tweets
      .rdd
      .filter(lambda x: not set(freq_patterns).issubset(x[2]))                   
      .sample(False, positive_fraction, 12345)
      .map(lambda x : (x[0], x[1], x[2], 0.0))
      .toDF()
      .withColumnRenamed("_1","id")
      .withColumnRenamed("_2","text")
      .withColumnRenamed("_3","filtered")
      .withColumnRenamed("_4","label"))
    training_data = positive_tweets.unionAll(negative_tweets)
    return training_data

<h1> Train LogisticRegression Classifier:</h1>

In [14]:
def train_lg(training_data):
    # Configure an ML pipeline, which consists of the following stages: hashingTF, idf, and lr.
    hashingTF = HashingTF(inputCol="filtered", outputCol="TF_features")
    idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
    pipeline1 = Pipeline(stages=[hashingTF, idf])

    # Fit the pipeline1 to training documents.
    model1 = pipeline1.fit(training_data)

    # TODO: more hyperparameter tuning is required
    lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline2 = Pipeline(stages=[model1, lr])

    # Fit the pipeline2 to training documents.
    model2 = pipeline2.fit(training_data)
    return model2

<h1>Evaluating LogisticRegression model on training data:</h1>

In [15]:
def get_training_score_lg(lg_model, training_data):
    training_prediction = lg_model.transform(training_data)
    selected = training_prediction.select("label", "prediction").rdd.map(lambda x: (x[0], x[1]))
    training_error = selected.filter(lambda (label, prediction): label != prediction).count() / float(tweets.count())
    print("Training Error = " + str(training_error))

<h1>Prepare testing data:</h1>

In [16]:
def create_testing_data(tweets):
    testing_data = (tweets
                    .rdd
                    .map(lambda x: Row(id=x[0], filtered=x[2]))
                    .toDF())
    return testing_data

In [17]:
def lg_prediction(lg_model, testing_data, collection):
    # Perfom predictions on test documents and save columns of interest to a file.
    prediction = lg_model.transform(testing_data)
    selected = prediction.select("id", "prediction")
    prediction_path = os.path.join(predictions_dir , time.strftime("%Y%m%d-%H%M%S") + '_'
                            + collection["Id"] + '_' 
                            + collection["name"])
    print(prediction_path)
    def saveData(data):
        with open(prediction_path, 'a') as f:
            f.write(data.id+"\t"+str(data.prediction)+"\n")
    selected.foreach(saveData)    
    # selected.rdd.saveAsTextFile(prediction_path)

<h1>Phase II: Train classifier and perform prediction</h1>

In [18]:
for x in config_data["collections"]:
    tweets = Load_tweets(x["Id"])
    if tweets:
        freq_patterns = x["FP"]
        tweets = preprocess_tweets(tweets)
        tweets = save_unique_token(tweets)
        training_data = create_training_data(tweets, freq_patterns)
        lg_model = train_lg(training_data)
        get_training_score_lg(lg_model, training_data)
        testing_data = create_testing_data(tweets)
        lg_prediction(lg_model, testing_data, x)

Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_602
Training Error = 7.55988508975e-05
/home/hshahin/Spring2016_IR_Project/data/small_data/predictions/20160331-171230_602_Germanwings
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_541
Training Error = 5.90510496324e-05
/home/hshahin/Spring2016_IR_Project/data/small_data/predictions/20160331-171248_541_NAACPBombing
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_668
Training Error = 0.0
/home/hshahin/Spring2016_IR_Project/data/small_data/predictions/20160331-171301_668_houstonflood
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_700
Training Error = 0.0
/home/hshahin/Spring2016_IR_Project/data/small_data/predictions/20160331-171313_700_wdbj7 shooting 
Loading /home/hshahin/Spring2016_IR_Project/data/small_data/z_686
Training Error = 0.0
/home/hshahin/Spring2016_IR_Project/data/small_data/predictions/20160331-171350_686_Obamacare
Loading /home/hshahin/Spring2016_IR_Project/data/smal



<h1>prediction DataFrame contians "prediction" column to be filled in Hbase</h1>

In [None]:
# Make predictions on test documents and print columns of interest.
prediction = model2.transform(testing_data)
selected = prediction.select("id", "prediction")
def saveData(data):
    with open(base_dir+'FPGrowth/700_classified.txt', 'a') as f:
        f.write(data.id+"\t"+str(data.prediction)+"\n")
selected.foreach(saveData)