diff --git a/src/Feed.hs b/src/Feed.hs index 626ffc8..a3f71cf 100644 --- a/src/Feed.hs +++ b/src/Feed.hs @@ -5,11 +5,14 @@ module Feed ( , find , transact , wrap +, allFeeds , updateFeed , overflow , numEntries , lastUpdate , entries +, entryFromId +, Feed(Feed, _id, _url) ) where @@ -24,6 +27,7 @@ import Database.HDBC import Database.HDBC.PostgreSQL import Parse import Entry +import Viterbi @@ -90,12 +94,13 @@ assimilateFeed :: String -> IO () assimilateFeed url = urlEntries url >>= \es -> case es of - Nothing -> putStrLn "_ticket: No entries found" + Nothing -> putStrLn ">> No entries found." Just xs -> do con <- connectPostgreSQL connStr saveMstr con xs url mapM_ (save con url) xs disconnect con + putStrLn ">> Feed assimilated." ---------------------------------------------------------------------------------------------------- @@ -106,7 +111,7 @@ cyclicalUpdate :: IO () cyclicalUpdate = do fds <- allFeeds case fds of - Nothing -> putStrLn "> No feeds detected. Ensure database is initialized and functional" + Nothing -> putStrLn ">> No feeds detected. Ensure database is initialized and functional" Just xs -> mapM_ (updateFeed . _url) xs >> threadDelay 21600000000 >> cyclicalUpdate @@ -167,10 +172,10 @@ saveMstr con es url = do save :: Connection -> String -> Entry -> IO (Integer) save con url e = do - let a = "'" ++ (description e) ++ "'" + let a = "'" ++ (L.filter noQuot $ description e) ++ "'" b = "'" ++ (unsafePerformIO (feedFromUrl url >>= return . show . _id . fromJust)) ++ "'" c = "'" ++ (date e) ++ "'" - d = "'" ++ (title e) ++ "'" + d = "'" ++ (L.filter noQuot $ title e) ++ "'" f = "'" ++ (link e) ++ "'" z = concat $ L.intersperse ", " [a, b, c, d, f] sql <- return $ "insert into entries (content, feed_id, date, title, link) values (" ++ z ++ ");" @@ -276,7 +281,7 @@ lastUpdate i = wrap (lastUpdate' i) ---------------------------------------------------------------------------------------------------- --- -- given a feed id, return all associated entries field +-- given a feed id, return all associated entries field entries :: Int -> IO [Entry] entries i = wrap (entries' i) @@ -290,3 +295,25 @@ entries i = wrap (entries' i) ---------------------------------------------------------------------------------------------------- + +-- find an entry by its id + +entryFromId :: Int -> IO Entry +entryFromId id = wrap (efID id) >>= return . flip (!!) 0 + + where efID :: Int -> Connection -> IO [Entry] + efID id con = do + let sel = "select * from entries where id=" ++ (show id) ++ ";" + f x = fromJust . fromSql . (flip (!!) x) + rows <- quickQuery' con sel [] + return $ map (\e -> Entry {description=(f 1 e), date=(f 3 e), title=(f 4 e), link=(f 5 e)}) rows + + +---------------------------------------------------------------------------------------------------- + +-- remove single quotes + +noQuot :: Char -> Bool +noQuot c = not (c == '\'' || c == '`') + + diff --git a/src/Main.hs b/src/Main.hs index 1d42323..2737c6f 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -38,6 +38,10 @@ processUsrCommands = do "-fi" -> communicate "-c" -> putStrLn "" "-t" -> runAllTests + "-tag" -> runTagger + "-addFeed" -> addFeeds + "-precluster" -> precluster + "-nvis" -> runNounVisualization "-testTextSim" -> testTextSim "-testViterbi" -> testViterbi "-testNouns" -> testNouns @@ -56,11 +60,65 @@ printHelp = do putStrLn ">> -fi : Run this instance as a function server" putStrLn ">> -c : Run this instance as a corpus manager" putStrLn ">> -t : Run all test functions" + putStrLn ">> -tag : Tag a document" + putStrLn ">> -addFeed : Add feeds to the index manually" + putStrLn ">> -precluster : Perform some pre-cluster processing" + putStrLn ">> -nvis : Run noun visualizations" putStrLn ">> -testViterbi : Run tests of the viterbi algorithm" putStrLn ">> -testTextSim : Run tests of text similarity functions" putStrLn ">> -testNouns : Run tests of the noun clustering functions" + +runTagger :: IO () +runTagger = do + putStrLn ">> Training Viterbi, please wait..." + vit <- trainVit + putStrLn ">> Training Complete." + doTag vit + +doTag :: Vit -> IO () +doTag vit = do + putStrLn ">> Enter the id of the document you would like to tag." + input <- getLine + putStrLn (">> Will tag document with id " ++ input ++ ".") + entry <- entryFromId (read input) + res <- tag vit $ B.pack $ description entry + putStrLn ">> Tagged Document: \n\n" + putStrLn (show res) + putStrLn "\n\n>> Tag Another? y/n" + input <- getLine + case (input == "y") || (input == "Y") of + True -> doTag vit + False -> return () + + +addFeeds :: IO () +addFeeds = do + putStrLn ">> Please enter the feed URL" + input <- getLine + putStrLn (">> Adding feed data at " ++ input ++ ", please wait...") + assimilateFeed input + putStrLn ">> Would you like to add another? y/n" + input <- getLine + case input of + "y" -> addFeeds + "Y" -> addFeeds + _ -> return () + + +precluster :: IO () +precluster = do + putStrLn ">> -c : Entire database to graph" + input <- getLine + case input of + "-c" -> convertDatabase + _ -> return () + +runNounVisualization :: IO () +runNounVisualization = runCluster "user" + + runAllTests :: IO () runAllTests = do testTextSim diff --git a/src/Text/Nouns (Gideon Providence's conflicted copy 2012-01-15).hs b/src/Text/Nouns (Gideon Providence's conflicted copy 2012-01-15).hs new file mode 100644 index 0000000..f19889f --- /dev/null +++ b/src/Text/Nouns (Gideon Providence's conflicted copy 2012-01-15).hs @@ -0,0 +1,467 @@ +module Text.Nouns ( + initialPositioning +) where + + +import Data.Maybe +import qualified Data.List as L +import qualified Data.ByteString.Char8 as B +import qualified Data.HashTable.IO as M +import Control.Exception +import Control.Monad +import Database.HDBC +import Database.HDBC.PostgreSQL +import Feed +import Viterbi + + +---------------------------------------------------------------------------------------------------- + +-- typedefs + +type ByteString = B.ByteString +type Table k v = M.CuckooHashTable k v +type Chart = Table ByteString [(ByteString, Int)] + + +---------------------------------------------------------------------------------------------------- + +-- represents the generalized distance (number of separating words) between two nouns in a document + +data IndexDist = Less10 + | Less100 + | Other + + +---------------------------------------------------------------------------------------------------- + +-- represents a point in the cluster-space + +newtype Point2D = Point2D { coords :: (Float, Float) -- position of the point in a 2D cluster-space + } deriving Show + + +-- data type to facilitate cache of relationship strengths, +-- the vertex-origin of this relationship is implicit + +data Strength = Strength { target :: Int -- the other vertex on this edge + , strength :: Int -- measure of relationship strength + } deriving Show + + +-- data type to facilitate caching of relationships on each noun + +data RelationCache = RCache { related :: [Strength] + , unRelated :: [Int] + } deriving Show + + +newtype ForceFunc = ForceFunc { applyF :: (Int -> Float -> Float) } + + +---------------------------------------------------------------------------------------------------- + +-- Database connection parameters (Password altered for github) + +connStr :: String +connStr = "host=localhost dbname=ticket connect_timeout=7 port=5432 user=postgres password=password" + + +---------------------------------------------------------------------------------------------------- + +-- Database + +---------------------------------------------------------------------------------------------------- +---------------------------------------------------------------------------------------------------- + +-- Document-Graph Conversion + +---------------------------------------------------------------------------------------------------- + +-- + +documentsToGraph :: Vit -> [ByteString] -> IO () +documentsToGraph vit docs = mapM_ (documentToGraph vit) docs + + +---------------------------------------------------------------------------------------------------- + +-- + +documentToGraph :: Vit -> ByteString -> IO () +documentToGraph vit doc = tag vit doc >>= nounsAndIndices >>= integrate + + +---------------------------------------------------------------------------------------------------- + +-- + +integrate :: [(ByteString, Int)] -> IO () + +integrate [] = return () + +integrate [x] = return () + +integrate (x:xs) = mapM_ (integrateX x) xs >> integrate xs + + +---------------------------------------------------------------------------------------------------- + +-- + +integrateX :: (ByteString, Int) -> (ByteString, Int) -> IO () +integrateX (cur, i) (fut, fi) = + case cur == fut of + True -> return () + False -> case idxDist i fi of + Less10 -> pushScore 3 cur fut + Less100 -> pushScore 2 cur fut + Other -> pushScore 1 cur fut + + +---------------------------------------------------------------------------------------------------- + +-- + +idxDist :: Int -> Int -> IndexDist +idxDist a b + | (abs $ a - b) <= 10 = Less10 + | (abs $ a - b) <= 100 = Less100 + | otherwise = Other + + +---------------------------------------------------------------------------------------------------- + +-- + +pushScore :: Int -> ByteString -> ByteString -> IO () +pushScore i a b = do + relId <- ensureNounExists a >>= \aid -> ensureNounExists b >>= \bid -> ensureRelExists aid bid + incrementRelation relId i + + +---------------------------------------------------------------------------------------------------- + +-- + +incrementRelation :: Int -> Int -> IO () +incrementRelation relId sre = do + let sel = "select score from edge where id=" ++ (show relId) ++ ";" + upd x y = "update edge set score=" ++ (show $ x + y) ++ " where id=" ++ (show relId) ++ ";" + sql <- wrap (genericSingleLookup sel) >>= return . fromJust >>= return . (upd sre) + wrap (execIncrement sql) + return () + + where execIncrement :: String -> Connection -> IO Integer + execIncrement sql con = withTransaction con (\c -> prepare c sql >>= flip execute []) + + +---------------------------------------------------------------------------------------------------- + +-- + +ensureNounExists :: ByteString -> IO Int +ensureNounExists n = ensure (nounLookup n) (nounCreate n) + + +---------------------------------------------------------------------------------------------------- + +-- + +ensureRelExists :: Int -> Int -> IO Int +ensureRelExists vtxa vtxb = do + probe <- relLookup vtxa vtxb >>= \trya -> relLookup vtxb vtxa >>= \tryb -> return (trya, tryb) + case probe of + (Nothing, Nothing) -> relCreate vtxa vtxb + (Just id, Nothing) -> return id + (Nothing, Just id) -> return id + (Just _, Just _) -> putStrLn "> Error in #ensureRelExists" >> return (-1) + + +---------------------------------------------------------------------------------------------------- + +-- + +ensure :: IO (Maybe Int) -> IO Int -> IO Int +ensure lookupF createF = + lookupF >>= \mid -> + case mid of + Nothing -> join (evaluate createF) >>= return + Just x -> return x + + +---------------------------------------------------------------------------------------------------- + +-- + +nounLookup :: ByteString -> IO (Maybe Int) +nounLookup n = + let sel = "select id from vertex where noun='" ++ (B.unpack n) ++ "';" + in wrap (genericSingleLookup sel) + + +---------------------------------------------------------------------------------------------------- + +-- + +relLookup :: Int -> Int -> IO (Maybe Int) +relLookup vtxa vtxb = + let sel = "select id from edge where nodea=" ++ (show vtxa) ++ " and nodeb=" ++ (show vtxb) ++ ";" + in wrap (genericSingleLookup sel) + + +---------------------------------------------------------------------------------------------------- + +-- + +nounCreate :: ByteString -> IO Int +nounCreate n = + let ins = "insert into vertex (noun) values ('" ++ (B.unpack n) ++ "');" + sel = "select id from vertex where noun='" ++ (B.unpack n) ++ "';" + in wrap (genericCreateWid ins sel) + + +---------------------------------------------------------------------------------------------------- + +-- + +relCreate :: Int -> Int -> IO Int +relCreate vtxa vtxb = + let ins = "insert into edge (nodea, nodeb) values (" ++ (show vtxa) ++ ", " ++ (show vtxb) ++ ");" + sel = "select id from edge where nodea=" ++ (show vtxa) ++ " and nodeb=" ++ (show vtxb) ++ ";" + in wrap (genericCreateWid ins sel) + + +---------------------------------------------------------------------------------------------------- + +-- + +genericLookup :: String -> Connection -> IO (Maybe [Int]) +genericLookup sel con = + quickQuery' con sel [] >>= \rows -> + case rows of + [] -> return Nothing + _ -> return $ Just $ L.map (fromSql . (flip (!!) 0)) rows + + +---------------------------------------------------------------------------------------------------- + +-- + +genericSingleLookup :: String -> Connection -> IO (Maybe Int) +genericSingleLookup sel con = + genericLookup sel con >>= return . liftM (flip (!!) 0) + + +---------------------------------------------------------------------------------------------------- + +-- + +genericCreateWid :: String -> String -> Connection -> IO Int +genericCreateWid ins sel con = do + withTransaction con (\c -> prepare c ins >>= flip execute []) + rows <- quickQuery' con sel [] + return (L.map (fromSql . (flip (!!) 0)) rows) >>= return . fromJust . flip (!!) 0 + + + + +---------------------------------------------------------------------------------------------------- + +-- Clustering interface + +---------------------------------------------------------------------------------------------------- + + + +clusterCycle :: Table Int RelationCache -> Table Int Point2D -> ForceFunc -> Table Int Point2D +clusterCycle edges verts ffunc = do + newVerts <- M.new + M.mapM_ (reposition newVerts verts) edges + + +reposition :: Table Int Point2D -> Table Int Point2D -> (Int, RelationCache) -> IO () +reposition tbl verts (id, rcache) = do + L.map + + +charge :: Int -> ForceFunc -> Point2D -> Point2D -> Point2D +charge score (ForceFunc ff) (Point2D (xa, ya)) (Point2D (xb, yb)) = + let curDist = pythagoras (abs $ xa - xb) (abs $ ya - yb) + move = ff score curDist + in (xb - (move * (xa - xb)), yb - (move * (ya - yb))) + + where pythagoras :: Float -> Float -> Float + pythagoras a b = sqrt ((a^2) + (b^2)) + + +configureModForce :: (Float -> Float -> Float) -> Float -> Int -> ForceFunc +configureModForce f moveMax scoreMax = ForceFunc (modForce f moveMax scoreMax) + + +modForce :: (Float -> Float -> Float) -> Float -> Int -> Int -> Float -> Float +modForce f moveMax scoreMax score dist = + let fscore = fromIntegral score + fscoreMax = fromIntegral scoreMax + + naiveMove = (fscore / fscoreMax) * moveMax -- move distance before modulation + scHypotenuse = moveMax * 100.0 -- largest distance possible + moveMultiplier = f dist scHypotenuse -- apply modulating function + + in naiveMove * moveMultiplier + + +linear :: Float -> Float -> Float +linear a b = a / b + + +exponential :: Float -> Float -> Float +exponential a b = (a^2) / (b^2) + + +fracExponential :: Float -> Float -> Float +fracExponential a b = (sqrt a) / (sqrt b) + + +cacheEdges :: IO (Table Int RelationCache) +cacheEdges = do + let sel = "select id from vertex;" + allIds <- vertexIds + htbl <- M.new + mapM_ (\id -> relationSummary' id allIds >>= M.insert htbl id) allIds + return htbl + + +cacheVertices :: [Point2D] -> IO (Table Int Point2D) +cacheVertices pts = do + ids <- vertexIds + tbl <- M.new + mapM_ (\(id, pt) -> M.insert tbl id pt) $ zip ids pts + return tbl + + +-- list of all noun database ids + +vertexIds :: IO [Int] +vertexIds = let sel = "select id from vertex;" + in wrap (genericLookup sel) >>= return . fromJust + +-- number of vertices in the graph + +numVertices :: IO Int +numVertices = let sel = "select count(id) from vertex;" + in wrap (genericSingleLookup sel) >>= return . fromJust + + +-- determine the highest current relation score +scoreMax :: IO Int +scoreMax = let sel = "select score from edge order by score desc limit 1;" + in wrap (genericSingleLookup sel) >>= return . fromJust + + +relationSummary :: Int -> IO RelationCache +relationSummary id = vertexIds >>= relationSummary' id + + +relationSummary' :: Int -> [Int] -> IO RelationCache +relationSummary' id allIds = do + related <- allRelated id + unrelated <- allUnrelated' id allIds related + strengths <- mapM (strengthBetween id) related + return $ RCache strengths unrelated + + + +-- the ids of all nouns related to the given id + +allRelated :: Int -> IO [Int] +allRelated id = do + let sela = "select nodeb from edge where nodea=" ++ (show id) ++ ";" + selb = "select nodea from edge where nodeb=" ++ (show id) ++ ";" + resa <- wrap (genericLookup sela) + resb <- wrap (genericLookup selb) + return $ L.concat $ catMaybes [resa, resb] + + +allUnrelated :: Int -> IO [Int] +allUnrelated id = + let sel = "select id from vertex;" + in wrap (genericLookup sel) >>= \lst -> allRelated id >>= \excl -> allUnrelated' id (fromJust lst) excl + + +allUnrelated' :: Int -> [Int] -> [Int] -> IO [Int] +allUnrelated' id lst excl = return $ L.filter (\x -> not (x `L.elem` excl)) lst + + +strengthBetween :: Int -> Int -> IO Strength +strengthBetween a b = do + let sela = "select score from edge where nodea=" ++ (show a) ++ " and nodeb=" ++ (show b) ++ ";" + selb = "select score from edge where nodea=" ++ (show b) ++ " and nodeb=" ++ (show a) ++ ";" + resa <- wrap (genericSingleLookup sela) + resb <- wrap (genericSingleLookup selb) + return $ case (resa, resb) of + (Just score, Nothing) -> Strength b score + (Nothing, Just score) -> Strength b score + (Nothing, Nothing) -> Strength b 0 + + + +---------------------------------------------------------------------------------------------------- + +-- build a pseudo-grid (@len@ x @len@) of points given the total number of points total + +initialPositioning :: Float -> Int -> [(Float, Float)] +initialPositioning len numPts = + let rt = sqrt $ fromIntegral numPts + rndUp = ceiling rt + rndDwn = floor rt + in L.concat $ case rndUp == rndDwn of + True -> coordsPerfectSq len rndUp + False -> coordsMismatch numPts len rndDwn rndUp + + +---------------------------------------------------------------------------------------------------- + +-- construct the pseudo-grid when an integer square can be taken of the number of points + +coordsPerfectSq :: Float -> Int -> [[(Float, Float)]] +coordsPerfectSq len bound = + let mod = len / (fromIntegral bound) + coords = take bound [ mod * i | i <- [0..] ] + in L.map (L.zip coords . L.repeat) coords + + +---------------------------------------------------------------------------------------------------- + +-- construct the pseudo-grid when an integer square cannot be taken of then number of points + +coordsMismatch :: Int -> Float -> Int -> Int -> [[(Float, Float)]] +coordsMismatch pts len rndDown rndUp + | willOverflow pts rndUp = + let mod = len / (fromIntegral rndUp) + coords = take rndUp [ mod * i | i <- [0..] ] + + -- final row + countF = pts - (rndUp * (rndUp - 1)) + modF = len / (fromIntegral countF) + coordsF = take countF [ modF * i | i <- [0..] ] + + in (L.map (zip coords . repeat) $ L.init coords) ++ [(zip coordsF (repeat $ L.last coords))] + + | otherwise = + let modX = len / (fromIntegral rndDown) + modY = len / (fromIntegral rndUp) + coordsX = take rndDown [ modX * i | i <- [0..] ] + coordsY = take rndUp [ modY * i | i <- [0..] ] + + -- final row + countF = pts - (rndDown^2) + modF = len / (fromIntegral countF) + coordsFX = take countF [ modF * i | i <- [0..] ] + + in (L.map (zip coordsX . repeat) $ L.init coordsY) ++ [(zip coordsFX (repeat $ L.last coordsY))] + + + where willOverflow :: Int -> Int -> Bool + willOverflow pts u = pts > (u * (u - 1)) + diff --git a/src/Text/Nouns.hs b/src/Text/Nouns.hs index 5b9572e..6733027 100644 --- a/src/Text/Nouns.hs +++ b/src/Text/Nouns.hs @@ -1,5 +1,7 @@ module Text.Nouns ( - initialPositioning + convertDatabase +, runCluster +, initialPositioning ) where @@ -19,6 +21,7 @@ import Network.Socket import Foreign.Storable import Foreign.Marshal.Alloc import Feed +import Entry import Viterbi @@ -66,6 +69,13 @@ data RelationCache = RCache { related :: [Strength] newtype ForceFunc = ForceFunc { applyF :: (Int -> Float -> Float) } +data ClusterConfiguration = CC { _attrF :: ForceFunc + , _replF :: ForceFunc + , _edges :: Table Int RelationCache + , _verts :: Table Int Point2D + } + + ---------------------------------------------------------------------------------------------------- -- Database connection parameters (Password altered for github) @@ -87,6 +97,34 @@ connStr = "host=localhost dbname=ticket connect_timeout=7 port=5432 user=postgre -- +convertDatabase :: IO () +convertDatabase = do + vit <- trainVit + mvar <- newEmptyMVar + feeds <- liftM fromJust allFeeds + ents <- mapM (\f -> find f entries) feeds >>= return . L.concat + docs <- return $ L.map (B.pack . description) ents + forkIO $ (\mv -> documentsToGraph vit (odds docs) >> putMVar mv True) mvar + documentsToGraph vit $ evens docs + takeMVar mvar + return () + + +evens :: [a] -> [a] +evens lst = snd $ L.foldl' alternator (False, []) lst + +odds :: [a] -> [a] +odds lst = snd $ L.foldl' alternator (True, []) lst + +alternator :: (Bool, [a]) -> a -> (Bool, [a]) +alternator acc x = case fst acc of + True -> (False, [x] ++ (snd acc)) + False -> (True, snd acc) + +---------------------------------------------------------------------------------------------------- + +-- + documentsToGraph :: Vit -> [ByteString] -> IO () documentsToGraph vit docs = mapM_ (documentToGraph vit) docs @@ -234,7 +272,10 @@ nounCreate n = relCreate :: Int -> Int -> IO Int relCreate vtxa vtxb = - let ins = "insert into edge (nodea, nodeb) values (" ++ (show vtxa) ++ ", " ++ (show vtxb) ++ ");" + let ins = "insert into edge (nodea, nodeb, score) values (" + ++ (show vtxa) ++ ", " + ++ (show vtxb) ++ ", " + ++ (show 0) ++ ");" sel = "select id from edge where nodea=" ++ (show vtxa) ++ " and nodeb=" ++ (show vtxb) ++ ";" in wrap (genericCreateWid ins sel) @@ -281,30 +322,139 @@ genericCreateWid ins sel con = do -- -runCluster :: Float -> IO () -runCluster dimen = do - let moveMax = (sqrt ((dimen^2) + (dimen^2))) / 100 - nchan <- newChan - edges <- cacheEdges - count <- numVertices - verts <- cacheVertices $ initialPositioning dimen count +runCluster :: String -> IO () +runCluster src = do + case src of + "user" -> do + putStrLn ">> Enter cluster-space size." + dimen <- getLine >>= return . read + (edges, verts) <- prepareCache dimen + chan <- prepVisualOut + runCluster' chan dimen src edges verts + _ -> do + return () + + +runCluster' :: Chan (Table Int Point2D) -> Float -> String -> Table Int RelationCache -> + Table Int Point2D -> IO () +runCluster' chan dimen src edges verts = + case src of + "user" -> do + (attrF, replF) <- userConfigureForces dimen + config <- return $ CC attrF replF edges verts + enterClusterCycle chan config 0 + stat <- confirmExit + case stat of + 0 -> runCluster src + 1 -> runCluster' chan dimen src edges verts + 2 -> return () + + _ -> do + return () + + +prepareCache :: Float -> IO (Table Int RelationCache, Table Int Point2D) +prepareCache dimen = do + putStrLn ">> Preparing cache. Perform this in parallel? y/n" + input <- getLine + case affirmative input of + True -> do + putStrLn ">> Parallelizing." + mvar <- newEmptyMVar + count <- numVertices + forkIO $ (\mv -> cacheEdges >>= putMVar mv) mvar + verts <- cacheVertices $ initialPositioning dimen count + edges <- takeMVar mvar + return (edges, verts) + False -> do + putStrLn ">> Single-Threading." + count <- numVertices + verts <- cacheVertices $ initialPositioning dimen count + edges <- cacheEdges + return (edges, verts) + + +prepVisualOut :: IO (Chan (Table Int Point2D)) +prepVisualOut = do + chan <- newChan + forkIO (waitVisualizationReq chan) + return chan + + +userConfigureForces :: Float -> IO (ForceFunc, ForceFunc) +userConfigureForces dimen = do + putStrLn ">> Enter a value for the max-move divisor." + putStrLn (">> Note: Current cluster-space size is " ++ (show dimen) ++ ".") + moveDivisor <- getLine >>= return . read + + moveMax <- return $ dimen / moveDivisor scMax <- scoreMax - attrF <- return $ configAttractiveMod linearA moveMax scMax - replF <- return $ configRepulsiveMod moveMax (moveMax * 5) - forkIO $ waitVisualizationReq nchan - iterativeClustering nchan edges verts attrF replF + putStrLn ">> Enter a value for the repulsion threshold." + repulThresh <- getLine >>= return . read + + modA <- buildForce "Attractive" + modR <- buildForce "Repulsive" + + attrF <- return $ configAttractiveMod modA dimen moveMax scMax + replF <- return $ configRepulsiveMod modR moveMax repulThresh + return (attrF, replF) + + +buildForce :: String -> IO (Float -> Float -> Float) +buildForce str = do + putStrLn (">> How would you like to modulate " ++ str ++ " Forces?") + putStrLn ">> 1 : Greater with distance." + putStrLn ">> 2 : Lesser with distance." + partA <- getLine >>= return . read + putStrLn (">> Which function type should be used to modulate " ++ str ++ " Forces?") + putStrLn ">> 1 : Linear" + putStrLn ">> 2 : Exponential" + putStrLn ">> 3 : Fractinal exponential" + partB <- getLine >>= return . read + return $ case (partA, partB) of + (1, 1) -> linearGD + (1, 2) -> exponentialGD + (1, 3) -> fracExponentialGD + (2, 1) -> linearLD + (2, 2) -> exponentialLD + (2, 3) -> fracExponentialLD + + +confirmExit :: IO Int +confirmExit = do + putStrLn ">> Enter one of the following numbers to continue." + putStrLn ">> 0 : Run another cluster, update all cached data." + putStrLn ">> 1 : Run another cluster, use existing cache." + putStrLn ">> 2 : Exit" + getLine >>= return . read + + +affirmative :: String -> Bool +affirmative str = (str == "y") || (str == "Y") + + + +enterClusterCycle :: Chan (Table Int Point2D) -> + ClusterConfiguration -> + Int -> + IO () +enterClusterCycle chan config count = + case count == 100 of + True -> do + putStrLn ">> 100 cycles completed, continue? y/n" + input <- getLine + case affirmative input of + True -> enterClusterCycle chan config 0 + False -> return () + + False -> do + let (e, v, a, r) = ((_edges config), (_verts config), (_attrF config), (_replF config)) + reportNewPositions chan v + newVerts <- clusterCycle e v a r + threadDelay 500000 + enterClusterCycle chan (CC a r e newVerts) (count + 1) -iterativeClustering :: Chan (Table Int Point2D) -> - Table Int RelationCache -> - Table Int Point2D -> - ForceFunc -> - ForceFunc -> - IO () -iterativeClustering chan edges verts attrF replF = do - reportNewPositions chan verts - nVerts <- clusterCycle edges verts attrF replF - iterativeClustering chan edges nVerts attrF replF ---------------------------------------------------------------------------------------------------- @@ -356,39 +506,58 @@ applyForce :: Int -> ForceFunc -> Point2D -> Point2D -> Point2D applyForce score (ForceFunc ff) (Point2D (xa, ya)) (Point2D (xb, yb)) = let curDist = pythagoras (xa - xb) (ya - yb) move = ff score curDist - moveRatio = move / curDist - in case moveRatio == 0 of + in case move == 0 of True -> Point2D (xb, yb) - False -> Point2D ((moveRatio * (xa - xb)) + xb, (moveRatio * (ya - yb)) + yb) + False -> Point2D ((move * (xa - xb)) + xb, (move * (ya - yb)) + yb) where pythagoras :: Float -> Float -> Float pythagoras a b = sqrt ((a^2) + (b^2)) + makePt :: Float -> (Float, Float) -> (Float, Float) -> Point2D + makePt move (xa, ya) (xb, yb) = + let (xz, yz) = ((move * (xa - xb)) + xb, (move * (ya - yb)) + yb) + in Point2D (sanitize xz, sanitize yz) + + sanitize :: Float -> Float + sanitize = lwrBnd . uprBnd + + uprBnd :: Float -> Float + uprBnd x = case (x > 1000.0) of + False -> x + True -> 1000.0 + + lwrBnd :: Float -> Float + lwrBnd x = case (x < 0.0) of + False -> x + True -> 0.0 + + ---------------------------------------------------------------------------------------------------- -- configure a repulsive force function for easy passing -configRepulsiveMod :: Float -> Float -> ForceFunc -configRepulsiveMod moveMax thresh = ForceFunc (modRepulsiveForce moveMax thresh) +configRepulsiveMod :: (Float -> Float -> Float) -> Float -> Float -> ForceFunc +configRepulsiveMod f moveMax thresh = ForceFunc (modRepulsiveForce f moveMax thresh) ---------------------------------------------------------------------------------------------------- -- configure an attractive force function for easy passing -configAttractiveMod :: (Float -> Float -> Float) -> Float -> Int -> ForceFunc -configAttractiveMod f moveMax scoreMax = ForceFunc (modAttractiveForce f moveMax scoreMax) +configAttractiveMod :: (Float -> Float -> Float) -> Float -> Float -> Int -> ForceFunc +configAttractiveMod f dimen moveMax scoreMax = + ForceFunc (modAttractiveForce f dimen moveMax scoreMax) ---------------------------------------------------------------------------------------------------- -- modulate the repulsive move according to a linear function -modRepulsiveForce :: Float -> Float -> Int -> Float -> Float -modRepulsiveForce moveMax thresh _ dist = - let trespass = 1 - (dist / thresh) +modRepulsiveForce :: (Float -> Float -> Float) -> Float -> Float -> Int -> Float -> Float +modRepulsiveForce f moveMax thresh _ dist = + let trespass = f dist thresh in case dist < thresh of False -> 0 True -> trespass * moveMax @@ -398,14 +567,13 @@ modRepulsiveForce moveMax thresh _ dist = -- modulate the attractive force on a vertex according to distance and the chosen mod function -modAttractiveForce :: (Float -> Float -> Float) -> Float -> Int -> Int -> Float -> Float -modAttractiveForce f moveMax scoreMax score dist = +modAttractiveForce :: (Float -> Float -> Float) -> Float -> Float -> Int -> Int -> Float -> Float +modAttractiveForce f dimen moveMax scoreMax score dist = let fscore = fromIntegral score fscoreMax = fromIntegral scoreMax - - naiveMove = (fscore / fscoreMax) * moveMax -- move distance before modulation - scHypotenuse = moveMax * 100.0 -- largest distance possible - moveMultiplier = f dist scHypotenuse -- apply modulating function + naiveMove = (fscore / fscoreMax) * moveMax -- move distance before modulation + scHypotenuse = sqrt ((dimen^2) + (dimen^2)) -- greatest dist possible + moveMultiplier = f dist scHypotenuse -- apply modulating function in naiveMove * moveMultiplier @@ -414,24 +582,35 @@ modAttractiveForce f moveMax scoreMax score dist = -- modulating function for attractive forces, linear -linearA :: Float -> Float -> Float -linearA a b = a / b +linearGD :: Float -> Float -> Float +linearGD a b = a / b + + +linearLD :: Float -> Float -> Float +linearLD a b = 1 - (linearGD a b) ---------------------------------------------------------------------------------------------------- -- modulating function for attractive forces, exponential -exponentialA :: Float -> Float -> Float -exponentialA a b = (a^2) / (b^2) +exponentialGD :: Float -> Float -> Float +exponentialGD a b = (a^2) / (b^2) + +exponentialLD :: Float -> Float -> Float +exponentialLD a b = 1 - (exponentialGD a b) ---------------------------------------------------------------------------------------------------- -- modulating function for repulsive forces, opposite exponential -fracExponentialA :: Float -> Float -> Float -fracExponentialA a b = (sqrt a) / (sqrt b) +fracExponentialGD :: Float -> Float -> Float +fracExponentialGD a b = 1 - (fracExponentialLD a b) + + +fracExponentialLD :: Float -> Float -> Float +fracExponentialLD a b = (sqrt a) / (sqrt b) ---------------------------------------------------------------------------------------------------- @@ -443,6 +622,7 @@ cacheEdges = do let sel = "select id from vertex;" allIds <- vertexIds htbl <- M.new + mvar <- newEmptyMVar mapM_ (\id -> relationSummary' id allIds >>= M.insert htbl id) allIds return htbl @@ -635,24 +815,32 @@ reportNewPositions chan tbl = writeChan chan tbl waitVisualizationReq :: Chan (Table Int Point2D) -> IO () waitVisualizationReq chan = do - addrinfo <- getAddrInfo (Just (defaultHints {addrFlags = [AI_PASSIVE]})) Nothing (Just "8001") + --putStrLn ">> Visualizations now available" + addrinfo <- getAddrInfo (Just (defaultHints {addrFlags = [AI_PASSIVE]})) Nothing (Just "8000") servaddr <- return $ head addrinfo sock <- socket (addrFamily servaddr) Stream defaultProtocol bindSocket sock (addrAddress servaddr) listen sock 10 - visualize chan sock + serve chan sock + + +serve :: Chan (Table Int Point2D) -> Socket -> IO () +serve chan sock = do + (connSock, cliAddr) <- accept sock + forkIO (visualize chan connSock) + serve chan sock visualize :: Chan (Table Int Point2D) -> Socket -> IO () visualize chan sock = do verts <- readChan chan listv <- M.toList verts - (connSock, cliAddr) <- accept sock - h <- socketToHandle connSock ReadWriteMode + listlen <- return $ L.length listv + h <- socketToHandle sock ReadWriteMode hSetBinaryMode h True + writeBytes 4 h ((fromIntegral listlen) :: Int32) mapM_ (writePoint h) listv hClose h - visualize chan sock writePoint :: Handle -> (Int, Point2D) -> IO () diff --git a/src/Viterbi.hs b/src/Viterbi.hs index a155c9c..90edc8a 100644 --- a/src/Viterbi.hs +++ b/src/Viterbi.hs @@ -180,11 +180,22 @@ nounsAndIndices' = return . fst . (L.foldl' accFunc ([], 0)) . (L.filter filterF -- tag a given string with its parts of speech tag :: Vit -> ByteString -> IO ByteString -tag unv str = return (clear unv) >>= \v -> - liftM snd $ foldM resolve (v, B.empty) $ B.splitWith splFunc $ B.filter noPunc str +tag unv str = do + v <- return (clear unv) + clean <- return $ cleanStr $ B.splitWith splFunc str + liftM snd $ foldM resolve (v, B.empty) clean - where noPunc :: Char -> Bool - noPunc c = not (c `L.elem` punctuationMarks) + where cleanStr :: [ByteString] -> [ByteString] + cleanStr = mapMaybe rigorousFilter + + rigorousFilter :: ByteString -> Maybe ByteString + rigorousFilter str = + let np = B.filter noPunc str + in case '=' `B.elem` str of + True -> Nothing + False -> case (np == (B.pack "br")) || (np == (B.pack "hr")) of + True -> Nothing + False -> Just $ B.filter (\x -> x /= ' ') np ---------------------------------------------------------------------------------------------------- @@ -310,6 +321,14 @@ filterFunc :: ByteString -> Bool filterFunc b = (b /= B.pack " ") && (b /= B.empty) +---------------------------------------------------------------------------------------------------- + +-- determine if a char is punctuation + +noPunc :: Char -> Bool +noPunc c = not (c `L.elem` punctuationMarks) + + ---------------------------------------------------------------------------------------------------- -- reset the lastWord field of a Vit